Skip to content

Commit eed5903

Browse files
committed
Switch to public bucket and lower case column names
1 parent be31f10 commit eed5903

1 file changed

Lines changed: 38 additions & 17 deletions

File tree

tutorials/parquet-catalog-demos/euclid-hats-query-methods.md

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -37,34 +37,43 @@ from upath import UPath
3737
```
3838

3939
```{code-cell}
40-
s3_bucket = "irsa-fornax-testdata"
41-
euclid_prefix = "EUCLID/q1/catalogues"
40+
import top
4241
43-
euclid_hats_collection_uri = UPath(f"s3://{s3_bucket}/{euclid_prefix}") # for lsdb
44-
euclid_parquet_metadata_path = f"{s3_bucket}/{euclid_prefix}/hats/dataset/_metadata" # for pyarrow
42+
RUN_ID = "euclid-hats-query-methods"
43+
top.tag(run_id=RUN_ID, time="start-run")
44+
```
45+
46+
```{code-cell}
47+
s3_bucket = "nasa-irsa-euclid-q1"
48+
euclid_prefix = "contributed/q1/merged_objects/hats"
49+
50+
euclid_hats_collection_uri = f"s3://{s3_bucket}/{euclid_prefix}" # for lsdb
51+
euclid_parquet_metadata_path = f"{s3_bucket}/{euclid_prefix}/euclid_q1_merged_objects-hats/dataset/_metadata" # for pyarrow
4552
4653
max_magnitude = 24.5
4754
min_flux = 10 ** ((max_magnitude - 23.9) / -2.5)
4855
```
4956

5057
```{code-cell}
5158
# Columns we actually want to load.
52-
OBJECT_ID = "OBJECT_ID"
53-
PHZ_Z = "PHZ_PHZ_MEDIAN"
59+
OBJECT_ID = "object_id"
60+
PHZ_Z = "phz_phz_median"
5461
columns = [OBJECT_ID, PHZ_Z]
5562
```
5663

5764
## Load with pyarrow
5865

5966
```{code-cell}
6067
%%time
68+
top.tag(run_id=RUN_ID, time="pyarrow")
69+
6170
# Construct filter for quality PHZ redshifts.
6271
phz_filter = (
63-
(pc.field("MER_VIS_DET") == 1) # No NIR-only objects.
64-
& (pc.field("MER_FLUX_DETECTION_TOTAL") > min_flux) # I < 24.5
65-
& (pc.divide(pc.field("MER_FLUX_DETECTION_TOTAL"), pc.field("MER_FLUXERR_DETECTION_TOTAL")) > 5) # I band S/N > 5
66-
& ~pc.field("PHZ_PHZ_CLASSIFICATION").isin([1, 3, 5, 7]) # Exclude objects classified as star.
67-
& (pc.field("MER_SPURIOUS_FLAG") == 0) # MER quality
72+
(pc.field("mer_vis_det") == 1) # No NIR-only objects.
73+
& (pc.field("mer_flux_detection_total") > min_flux) # I < 24.5
74+
& (pc.divide(pc.field("mer_flux_detection_total"), pc.field("mer_fluxerr_detection_total")) > 5) # I band S/N > 5
75+
& ~pc.field("phz_phz_classification").isin([1, 3, 5, 7]) # Exclude objects classified as star.
76+
& (pc.field("mer_spurious_flag") == 0) # MER quality
6877
)
6978
7079
# Load.
@@ -89,6 +98,9 @@ This query has no spatial component and needs to look at all the files, so below
8998

9099
```{code-cell}
91100
%%time
101+
top.tag(run_id=RUN_ID, time="pyarrow + dask")
102+
103+
92104
@dask.delayed
93105
def load_fragment(frag):
94106
table = frag.to_table(filter=phz_filter, columns=columns)
@@ -119,17 +131,19 @@ pa_df.equals(padask_df)
119131

120132
```{code-cell}
121133
%%time
134+
top.tag(run_id=RUN_ID, time="lsdb")
135+
122136
# Construct the query equivalent of phz_filter.
123137
query = (
124-
"MER_VIS_DET == 1"
125-
f" & MER_FLUX_DETECTION_TOTAL > {min_flux}"
126-
" & MER_FLUX_DETECTION_TOTAL / MER_FLUXERR_DETECTION_TOTAL > 5"
127-
" & PHZ_PHZ_CLASSIFICATION not in [1,3,5,7]"
128-
" & MER_SPURIOUS_FLAG == 0"
138+
"mer_vis_det == 1"
139+
f" & mer_flux_detection_total > {min_flux}"
140+
" & mer_flux_detection_total / mer_fluxerr_detection_total > 5"
141+
" & phz_phz_classification not in [1,3,5,7]"
142+
" & mer_spurious_flag == 0"
129143
)
130144
131145
# We don't want to load these columns, but we have to in order to use them in the filter.
132-
extra_columns = ["MER_VIS_DET", "MER_FLUX_DETECTION_TOTAL", "PHZ_PHZ_CLASSIFICATION", "MER_SPURIOUS_FLAG", "MER_FLUXERR_DETECTION_TOTAL"]
146+
extra_columns = ["mer_vis_det", "mer_flux_detection_total", "phz_phz_classification", "mer_spurious_flag", "mer_fluxerr_detection_total"]
133147
134148
# Load.
135149
client = dask.distributed.Client(n_workers=os.cpu_count(), threads_per_worker=2, memory_limit=None)
@@ -151,4 +165,11 @@ lsdb_df
151165
```{code-cell}
152166
# Check for equality.
153167
lsdb_df[PHZ_Z].astype("float32").equals(pa_df[PHZ_Z])
168+
169+
top.tag(run_id=RUN_ID, time="end-run")
170+
171+
172+
tl = top.load_top_output(run_id=RUN_ID, named_pids_only=False)
173+
fig = tl.plot_overview()
174+
fig.savefig(tl.base_dir / "top.png")
154175
```

0 commit comments

Comments
 (0)