Skip to content

Commit 9222c6f

Browse files
authored
Merge pull request #8 from ajperillo19/main
Spring SULI 2026 Contributions - Antonio Perillo
2 parents 75c7729 + 0a5c2cc commit 9222c6f

4 files changed

Lines changed: 812 additions & 0 deletions

File tree

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
import argparse
2+
import os
3+
import matplotlib.pyplot as plt
4+
import scipy as sp
5+
import numpy as np
6+
import pandas as pd
7+
from matplotlib.colors import LogNorm
8+
from mpl_toolkits.axes_grid1 import make_axes_locatable
9+
10+
# column names expected in concatenated DataFrame (true MC values)
11+
PX_MC_COL = "mc_elec_px"
12+
PY_MC_COL = "mc_elec_py"
13+
PZ_MC_COL = "mc_elec_pz"
14+
xbjk_MC_COL = "mc_x"
15+
Q2_MC_COL = "mc_q2"
16+
Y_MC_COL = "mc_y"
17+
18+
# column names expected in concatenated DataFrame (Reconstructed Values) (Electron Method)
19+
PX_COL = "elec_px"
20+
PY_COL = "elec_py"
21+
PZ_COL = "elec_pz"
22+
xbjk_COL = "electron_x"
23+
Q2_COL = "electron_q2"
24+
E_COL = "elec_energy"
25+
Y_COL = "electron_y"
26+
27+
# Double Angle Method
28+
xbjk_DA_COL = "da_x"
29+
Q2_DA_COL = "da_q2"
30+
Y_DA_COL = "da_y"
31+
32+
# E Sigma Method
33+
xbjk_ESIG_COL = "esigma_x"
34+
Q2_ESIG_COL = "esigma_q2"
35+
Y_ESIG_COL = "esigma_y"
36+
37+
# Jaquet-Blondel Method
38+
xbjk_JB_COL = "jb_x"
39+
Q2_JB_COL = "jb_q2"
40+
Y_JB_COL = "jb_y"
41+
42+
# Sigma Method
43+
xbjk_SIG_COL = "sigma_x"
44+
Q2_SIG_COL = "sigma_q2"
45+
Y_SIG_COL = "sigma_y"
46+
47+
48+
def process_and_mask(file):
49+
"""
50+
Return a dict with numeric arrays for px,py,pz and derived kinematics.
51+
Only events where reconstructed kinematics are present are kept (so MC and reco align).
52+
"""
53+
54+
# ---------- load data ----------
55+
56+
suffix = file.lower()
57+
if ".parquet" or ".parq" in (suffix):
58+
df = pd.read_parquet(file)
59+
elif ".feather" in (suffix):
60+
df = pd.read_feather(file)
61+
elif ".csv" or ".txt" in (suffix):
62+
# read only the columns we need (fast & memory friendly)
63+
df = pd.read_csv(file)
64+
else:
65+
raise ValueError(f"Unsupported acceptance file type: {suffix}")
66+
df = pd.read_parquet(file, engine="pyarrow")
67+
if df is None or df.empty:
68+
print(" -> concatenated dataframe empty or None")
69+
70+
# ---------- coerce important columns to numeric (in-place) ----------
71+
reco_needed = [PX_COL, PY_COL, PZ_COL, xbjk_COL, Q2_COL, E_COL, Y_COL]
72+
mc_needed = [PX_MC_COL, PY_MC_COL, PZ_MC_COL, xbjk_MC_COL, Q2_MC_COL, Y_MC_COL]
73+
DA_needed = [xbjk_DA_COL, Q2_DA_COL, Y_DA_COL]
74+
ESIGMA_needed = [xbjk_ESIG_COL, Q2_ESIG_COL, Y_ESIG_COL]
75+
JB_needed = [xbjk_JB_COL, Q2_JB_COL, Y_JB_COL]
76+
SIGMA_needed = [xbjk_SIG_COL, Q2_SIG_COL, Y_SIG_COL]
77+
78+
df[reco_needed] = df[reco_needed].apply(pd.to_numeric, errors="coerce")
79+
df[mc_needed] = df[mc_needed].apply(pd.to_numeric, errors="coerce")
80+
df[DA_needed] = df[DA_needed].apply(pd.to_numeric, errors="coerce")
81+
df[ESIGMA_needed] = df[ESIGMA_needed].apply(pd.to_numeric, errors="coerce")
82+
df[JB_needed] = df[JB_needed].apply(pd.to_numeric, errors="coerce")
83+
df[SIGMA_needed] = df[SIGMA_needed].apply(pd.to_numeric, errors="coerce")
84+
85+
# ---------- build masks ----------
86+
reco_complete_mask = df[reco_needed].notna().all(axis=1)
87+
mc_complete_mask = df[mc_needed].notna().all(axis=1)
88+
DA_mask = df[DA_needed].notna().all(axis=1)
89+
ESIGMA_mask = df[ESIGMA_needed].notna().all(axis=1)
90+
JB_mask = df[JB_needed].notna().all(axis=1)
91+
SIGMA_mask = df[SIGMA_needed].notna().all(axis=1)
92+
93+
matched_mask = reco_complete_mask & mc_complete_mask & DA_mask & ESIGMA_mask & JB_mask & SIGMA_mask
94+
95+
n_total = len(df)
96+
n_reco = int(reco_complete_mask.sum())
97+
n_mc = int(mc_complete_mask.sum())
98+
n_matched = int(matched_mask.sum())
99+
print(f"Total events: {n_total}; reco-complete: {n_reco}; mc-complete: {n_mc}; matched: {n_matched}")
100+
101+
if n_matched == 0:
102+
print(" -> No matched (reco+mc) events found; returning None")
103+
104+
df_matched = df.loc[matched_mask].copy()
105+
106+
# ---------- MC arrays ----------
107+
px_mc = df_matched[PX_MC_COL].to_numpy()
108+
py_mc = df_matched[PY_MC_COL].to_numpy()
109+
pz_mc = df_matched[PZ_MC_COL].to_numpy()
110+
x_mc = df_matched[xbjk_MC_COL].to_numpy()
111+
q2_mc = df_matched[Q2_MC_COL].to_numpy()
112+
y_mc = df_matched[Y_MC_COL].to_numpy()
113+
114+
pt_mc = np.sqrt(px_mc**2 + py_mc**2)
115+
p_mc = np.sqrt(px_mc**2 + py_mc**2 + pz_mc**2)
116+
E_mc = p_mc.copy()
117+
with np.errstate(invalid="ignore", divide="ignore"):
118+
theta_mc = np.arccos(np.clip(pz_mc / p_mc, -1.0, 1.0))
119+
eta_mc = -np.log(np.tan(theta_mc / 2.0))
120+
phi_mc = np.arctan2(py_mc, px_mc)
121+
122+
# ---------- reco arrays ----------
123+
px = df_matched[PX_COL].to_numpy()
124+
py = df_matched[PY_COL].to_numpy()
125+
pz = df_matched[PZ_COL].to_numpy()
126+
x = df_matched[xbjk_COL].to_numpy()
127+
q2 = df_matched[Q2_COL].to_numpy()
128+
E_reco = df_matched[E_COL].to_numpy()
129+
y = df_matched[Y_COL].to_numpy()
130+
131+
pt = np.sqrt(px**2 + py**2)
132+
p = np.sqrt(px**2 + py**2 + pz**2)
133+
with np.errstate(invalid="ignore", divide="ignore"):
134+
theta = np.arccos(np.clip(pz / p, -1.0, 1.0))
135+
eta = -np.log(np.tan(theta / 2.0))
136+
phi = np.arctan2(py, px)
137+
138+
# ---------- alternative method arrays ----------
139+
x_DA = df_matched[xbjk_DA_COL].to_numpy()
140+
q2_DA = df_matched[Q2_DA_COL].to_numpy()
141+
y_DA = df_matched[Y_DA_COL].to_numpy()
142+
143+
x_JB = df_matched[xbjk_JB_COL].to_numpy()
144+
q2_JB = df_matched[Q2_JB_COL].to_numpy()
145+
y_JB = df_matched[Y_JB_COL].to_numpy()
146+
147+
x_SIGMA = df_matched[xbjk_SIG_COL].to_numpy()
148+
q2_SIGMA = df_matched[Q2_SIG_COL].to_numpy()
149+
y_SIGMA = df_matched[Y_SIG_COL].to_numpy()
150+
151+
x_ESIGMA = df_matched[xbjk_ESIG_COL].to_numpy()
152+
q2_ESIGMA = df_matched[Q2_ESIG_COL].to_numpy()
153+
y_ESIGMA = df_matched[Y_ESIG_COL].to_numpy()
154+
155+
return {
156+
"n_total": n_total, "n_reco": n_reco, "n_mc": n_mc, "n_matched": n_matched,
157+
"px_mc": px_mc, "py_mc": py_mc, "pz_mc": pz_mc,
158+
"pt_mc": pt_mc, "p_mc": p_mc, "theta_mc": theta_mc, "eta_mc": eta_mc, "phi_mc": phi_mc,
159+
"x_mc": x_mc, "q2_mc": q2_mc, "E_mc": E_mc, "y_mc": y_mc,
160+
"px": px, "py": py, "pz": pz,
161+
"pt": pt, "p": p, "theta": theta, "eta": eta, "phi": phi,
162+
"x": x, "q2": q2, "E_reco": E_reco, "y": y,
163+
"x_DA": x_DA, "q2_DA": q2_DA, "y_DA": y_DA,
164+
"x_JB": x_JB, "q2_JB": q2_JB, "y_JB": y_JB,
165+
"x_SIGMA": x_SIGMA, "q2_SIGMA": q2_SIGMA, "y_SIGMA": y_SIGMA,
166+
"x_ESIGMA": x_ESIGMA, "q2_ESIGMA": q2_ESIGMA, "y_ESIGMA": y_ESIGMA,
167+
}
168+
169+
170+
def plot_reco_vs_true_all(file,
171+
results_dir="results",
172+
reco_methods=None,
173+
reco_Q2=None,
174+
reco_y=None,
175+
reco_x=None,
176+
nbins_q2=200,
177+
nbins_y=100,
178+
cmap="viridis",
179+
figsize=(14, 18)):
180+
181+
if reco_methods is None:
182+
reco_methods = ["Electron Method", "Double Angle", "Jaquet-Blondel", "Sigma", "ESigma"]
183+
if reco_Q2 is None:
184+
reco_Q2 = ["q2", "q2_DA", "q2_JB", "q2_SIGMA", "q2_ESIGMA"]
185+
if reco_y is None:
186+
reco_y = ["y", "y_DA", "y_JB", "y_SIGMA", "y_ESIGMA"]
187+
if reco_x is None:
188+
reco_x = ["x", "x_DA", "x_JB", "x_SIGMA", "x_ESIGMA"]
189+
190+
df = process_and_mask(file)
191+
df = pd.DataFrame(df)
192+
193+
nmethods = len(reco_methods)
194+
if not (len(reco_Q2) == nmethods == len(reco_y) == len(reco_x)):
195+
raise ValueError("reco_methods, reco_Q2, reco_y, reco_x must have the same length")
196+
197+
fig, axes = plt.subplots(nmethods, 3, figsize=figsize, constrained_layout=False)
198+
fig.subplots_adjust(hspace=0.35, wspace=0.45, right=0.92)
199+
200+
for i, method in enumerate(reco_methods):
201+
ax_q2 = axes[i, 0]
202+
ax_y = axes[i, 1]
203+
ax_x = axes[i, 2]
204+
205+
q2_col = reco_Q2[i]
206+
y_col = reco_y[i]
207+
x_col = reco_x[i]
208+
209+
# ---------- Q2 ----------
210+
mask_q2 = np.isfinite(df["q2_mc"]) & np.isfinite(df[q2_col]) & (df["q2_mc"] > 0) & (df[q2_col] > 0)
211+
xq = df.loc[mask_q2, "q2_mc"].to_numpy()
212+
yq = df.loc[mask_q2, q2_col].to_numpy()
213+
214+
if len(xq) > 10:
215+
xedges = np.logspace(np.log10(xq.min()), np.log10(xq.max()), nbins_q2 + 1)
216+
yedges = np.logspace(np.log10(yq.min()), np.log10(yq.max()), nbins_q2 + 1)
217+
im_q2 = ax_q2.hist2d(xq, yq, bins=(xedges, yedges), norm=LogNorm(vmin=1), cmin=1, cmap=cmap)
218+
ax_q2.set_xscale('log'); ax_q2.set_yscale('log')
219+
ax_q2.set_xlim(10e0, 10e2); ax_q2.set_ylim(10e0, 10e2)
220+
ax_q2.set_xlabel(r"$Q^2_{true}$ $(\mathrm{GeV}/c)^2$")
221+
ax_q2.set_ylabel(r"$Q^2_{reco}$ $(\mathrm{GeV}/c)^2$")
222+
ax_q2.set_title(f"{method} — Q²")
223+
line_coords = np.logspace(np.log10(min(xedges[0], yedges[0])), np.log10(max(xedges[-1], yedges[-1])), 100)
224+
ax_q2.plot(line_coords, line_coords, 'r--', linewidth=1.2)
225+
divider = make_axes_locatable(ax_q2)
226+
cax = divider.append_axes("right", size="4%", pad=0.06)
227+
fig.colorbar(im_q2[3], cax=cax).set_label("counts")
228+
else:
229+
ax_q2.text(0.5, 0.5, "Insufficient Q2 data", ha='center', va='center')
230+
231+
# ---------- x ----------
232+
mask_x = np.isfinite(df["x_mc"]) & np.isfinite(df[x_col]) & (df["x_mc"] > 0) & (df[x_col] > 0)
233+
xx = df.loc[mask_x, "x_mc"].to_numpy()
234+
yx = df.loc[mask_x, x_col].to_numpy()
235+
236+
if len(xx) > 10:
237+
xedges_x = np.logspace(np.log10(xx.min()), np.log10(xx.max()), nbins_q2 + 1)
238+
yedges_x = np.logspace(np.log10(yx.min()), np.log10(yx.max()), nbins_q2 + 1)
239+
im_x = ax_x.hist2d(xx, yx, bins=(xedges_x, yedges_x), norm=LogNorm(vmin=1), cmin=1, cmap=cmap)
240+
ax_x.set_xscale('log'); ax_x.set_yscale('log')
241+
ax_x.set_xlim(10e-5, 10e-1); ax_x.set_ylim(10e-5, 10e-1)
242+
ax_x.set_xlabel(r"$x_{true}$"); ax_x.set_ylabel(r"$x_{reco}$")
243+
ax_x.set_title(f"{method} — x")
244+
line_coords = np.logspace(np.log10(min(xedges_x[0], yedges_x[0])), np.log10(max(xedges_x[-1], yedges_x[-1])), 100)
245+
ax_x.plot(line_coords, line_coords, 'r--', linewidth=1.2)
246+
divider = make_axes_locatable(ax_x)
247+
cax = divider.append_axes("right", size="4%", pad=0.06)
248+
fig.colorbar(im_x[3], cax=cax).set_label("counts")
249+
else:
250+
ax_x.text(0.5, 0.5, "Insufficient x data", ha='center', va='center')
251+
252+
# ---------- y ----------
253+
mask_y = np.isfinite(df["y_mc"]) & np.isfinite(df[y_col])
254+
xy = df.loc[mask_y, "y_mc"].to_numpy()
255+
yy = df.loc[mask_y, y_col].to_numpy()
256+
257+
if len(xy) > 10:
258+
im_y = ax_y.hist2d(xy, yy, bins=(nbins_y, nbins_y), range=((0, 1), (0, 1)), norm=LogNorm(vmin=1), cmin=1, cmap=cmap)
259+
ax_y.set_xlim(0, 1); ax_y.set_ylim(0, 1)
260+
ax_y.set_xlabel(r"$y_{true}$"); ax_y.set_ylabel(r"$y_{reco}$")
261+
ax_y.set_title(f"{method} — y")
262+
ax_y.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), 'r--', linewidth=1.0)
263+
divider = make_axes_locatable(ax_y)
264+
cax = divider.append_axes("right", size="4%", pad=0.06)
265+
fig.colorbar(im_y[3], cax=cax).set_label("counts")
266+
else:
267+
ax_y.text(0.5, 0.5, "Insufficient y data", ha='center', va='center')
268+
269+
plt.tight_layout()
270+
271+
# ---------- save to results directory ----------
272+
os.makedirs(results_dir, exist_ok=True)
273+
stem = os.path.splitext(os.path.basename(file))[0] # e.g. "18x275_Reco_data"
274+
out_path = os.path.join(results_dir, f"{stem}_reco_vs_true.png")
275+
plt.savefig(out_path, dpi=200, bbox_inches="tight")
276+
print(f"Saved: {out_path}")
277+
plt.close(fig)
278+
279+
280+
def parse_args():
281+
parser = argparse.ArgumentParser(
282+
description="Plot reco-vs-true kinematic comparisons for DIS Sullivan process events."
283+
)
284+
parser.add_argument(
285+
"file",
286+
help="Path to input .parquet data file"
287+
)
288+
parser.add_argument(
289+
"--results-dir", "-o",
290+
default="results",
291+
help="Directory to write output into (created if absent). Default: ./results"
292+
)
293+
parser.add_argument(
294+
"--nbins-q2", type=int, default=200,
295+
help="Number of bins for Q2 and x 2D histograms. Default: 200"
296+
)
297+
parser.add_argument(
298+
"--nbins-y", type=int, default=100,
299+
help="Number of bins for y 2D histogram. Default: 100"
300+
)
301+
parser.add_argument(
302+
"--cmap", default="viridis",
303+
help="Matplotlib colormap name. Default: viridis"
304+
)
305+
return parser.parse_args()
306+
307+
308+
if __name__ == "__main__":
309+
args = parse_args()
310+
plot_reco_vs_true_all(
311+
file=args.file,
312+
results_dir=args.results_dir,
313+
nbins_q2=args.nbins_q2,
314+
nbins_y=args.nbins_y,
315+
cmap=args.cmap,
316+
)

0 commit comments

Comments
 (0)