@@ -37,34 +37,43 @@ from upath import UPath
3737```
3838
3939``` {code-cell}
40- s3_bucket = "irsa-fornax-testdata"
41- euclid_prefix = "EUCLID/q1/catalogues"
40+ import top
4241
43- euclid_hats_collection_uri = UPath(f"s3://{s3_bucket}/{euclid_prefix}") # for lsdb
44- euclid_parquet_metadata_path = f"{s3_bucket}/{euclid_prefix}/hats/dataset/_metadata" # for pyarrow
42+ RUN_ID = "euclid-hats-query-methods"
43+ top.tag(run_id=RUN_ID, time="start-run")
44+ ```
45+
46+ ``` {code-cell}
47+ s3_bucket = "nasa-irsa-euclid-q1"
48+ euclid_prefix = "contributed/q1/merged_objects/hats"
49+
50+ euclid_hats_collection_uri = f"s3://{s3_bucket}/{euclid_prefix}" # for lsdb
51+ euclid_parquet_metadata_path = f"{s3_bucket}/{euclid_prefix}/euclid_q1_merged_objects-hats/dataset/_metadata" # for pyarrow
4552
4653max_magnitude = 24.5
4754min_flux = 10 ** ((max_magnitude - 23.9) / -2.5)
4855```
4956
5057``` {code-cell}
5158# Columns we actually want to load.
52- OBJECT_ID = "OBJECT_ID "
53- PHZ_Z = "PHZ_PHZ_MEDIAN "
59+ OBJECT_ID = "object_id "
60+ PHZ_Z = "phz_phz_median "
5461columns = [OBJECT_ID, PHZ_Z]
5562```
5663
5764## Load with pyarrow
5865
5966``` {code-cell}
6067%%time
68+ top.tag(run_id=RUN_ID, time="pyarrow")
69+
6170# Construct filter for quality PHZ redshifts.
6271phz_filter = (
63- (pc.field("MER_VIS_DET ") == 1) # No NIR-only objects.
64- & (pc.field("MER_FLUX_DETECTION_TOTAL ") > min_flux) # I < 24.5
65- & (pc.divide(pc.field("MER_FLUX_DETECTION_TOTAL "), pc.field("MER_FLUXERR_DETECTION_TOTAL ")) > 5) # I band S/N > 5
66- & ~pc.field("PHZ_PHZ_CLASSIFICATION ").isin([1, 3, 5, 7]) # Exclude objects classified as star.
67- & (pc.field("MER_SPURIOUS_FLAG ") == 0) # MER quality
72+ (pc.field("mer_vis_det ") == 1) # No NIR-only objects.
73+ & (pc.field("mer_flux_detection_total ") > min_flux) # I < 24.5
74+ & (pc.divide(pc.field("mer_flux_detection_total "), pc.field("mer_fluxerr_detection_total ")) > 5) # I band S/N > 5
75+ & ~pc.field("phz_phz_classification ").isin([1, 3, 5, 7]) # Exclude objects classified as star.
76+ & (pc.field("mer_spurious_flag ") == 0) # MER quality
6877)
6978
7079# Load.
@@ -89,6 +98,9 @@ This query has no spatial component and needs to look at all the files, so below
8998
9099``` {code-cell}
91100%%time
101+ top.tag(run_id=RUN_ID, time="pyarrow + dask")
102+
103+
92104@dask.delayed
93105def load_fragment(frag):
94106 table = frag.to_table(filter=phz_filter, columns=columns)
@@ -119,17 +131,19 @@ pa_df.equals(padask_df)
119131
120132``` {code-cell}
121133%%time
134+ top.tag(run_id=RUN_ID, time="lsdb")
135+
122136# Construct the query equivalent of phz_filter.
123137query = (
124- "MER_VIS_DET == 1"
125- f" & MER_FLUX_DETECTION_TOTAL > {min_flux}"
126- " & MER_FLUX_DETECTION_TOTAL / MER_FLUXERR_DETECTION_TOTAL > 5"
127- " & PHZ_PHZ_CLASSIFICATION not in [1,3,5,7]"
128- " & MER_SPURIOUS_FLAG == 0"
138+ "mer_vis_det == 1"
139+ f" & mer_flux_detection_total > {min_flux}"
140+ " & mer_flux_detection_total / mer_fluxerr_detection_total > 5"
141+ " & phz_phz_classification not in [1,3,5,7]"
142+ " & mer_spurious_flag == 0"
129143)
130144
131145# We don't want to load these columns, but we have to in order to use them in the filter.
132- extra_columns = ["MER_VIS_DET ", "MER_FLUX_DETECTION_TOTAL ", "PHZ_PHZ_CLASSIFICATION ", "MER_SPURIOUS_FLAG ", "MER_FLUXERR_DETECTION_TOTAL "]
146+ extra_columns = ["mer_vis_det ", "mer_flux_detection_total ", "phz_phz_classification ", "mer_spurious_flag ", "mer_fluxerr_detection_total "]
133147
134148# Load.
135149client = dask.distributed.Client(n_workers=os.cpu_count(), threads_per_worker=2, memory_limit=None)
@@ -151,4 +165,11 @@ lsdb_df
151165``` {code-cell}
152166# Check for equality.
153167lsdb_df[PHZ_Z].astype("float32").equals(pa_df[PHZ_Z])
168+
169+ top.tag(run_id=RUN_ID, time="end-run")
170+
171+
172+ tl = top.load_top_output(run_id=RUN_ID, named_pids_only=False)
173+ fig = tl.plot_overview()
174+ fig.savefig(tl.base_dir / "top.png")
154175```
0 commit comments