1+ import argparse
2+ import os
3+ import matplotlib .pyplot as plt
4+ import scipy as sp
5+ import numpy as np
6+ import pandas as pd
7+ from matplotlib .colors import LogNorm
8+ from mpl_toolkits .axes_grid1 import make_axes_locatable
9+
10+ # column names expected in concatenated DataFrame (true MC values)
11+ PX_MC_COL = "mc_elec_px"
12+ PY_MC_COL = "mc_elec_py"
13+ PZ_MC_COL = "mc_elec_pz"
14+ xbjk_MC_COL = "mc_x"
15+ Q2_MC_COL = "mc_q2"
16+ Y_MC_COL = "mc_y"
17+
18+ # column names expected in concatenated DataFrame (Reconstructed Values) (Electron Method)
19+ PX_COL = "elec_px"
20+ PY_COL = "elec_py"
21+ PZ_COL = "elec_pz"
22+ xbjk_COL = "electron_x"
23+ Q2_COL = "electron_q2"
24+ E_COL = "elec_energy"
25+ Y_COL = "electron_y"
26+
27+ # Double Angle Method
28+ xbjk_DA_COL = "da_x"
29+ Q2_DA_COL = "da_q2"
30+ Y_DA_COL = "da_y"
31+
32+ # E Sigma Method
33+ xbjk_ESIG_COL = "esigma_x"
34+ Q2_ESIG_COL = "esigma_q2"
35+ Y_ESIG_COL = "esigma_y"
36+
37+ # Jaquet-Blondel Method
38+ xbjk_JB_COL = "jb_x"
39+ Q2_JB_COL = "jb_q2"
40+ Y_JB_COL = "jb_y"
41+
42+ # Sigma Method
43+ xbjk_SIG_COL = "sigma_x"
44+ Q2_SIG_COL = "sigma_q2"
45+ Y_SIG_COL = "sigma_y"
46+
47+
48+ def process_and_mask (file ):
49+ """
50+ Return a dict with numeric arrays for px,py,pz and derived kinematics.
51+ Only events where reconstructed kinematics are present are kept (so MC and reco align).
52+ """
53+
54+ # ---------- load data ----------
55+
56+ suffix = file .lower ()
57+ if ".parquet" or ".parq" in (suffix ):
58+ df = pd .read_parquet (file )
59+ elif ".feather" in (suffix ):
60+ df = pd .read_feather (file )
61+ elif ".csv" or ".txt" in (suffix ):
62+ # read only the columns we need (fast & memory friendly)
63+ df = pd .read_csv (file )
64+ else :
65+ raise ValueError (f"Unsupported acceptance file type: { suffix } " )
66+ df = pd .read_parquet (file , engine = "pyarrow" )
67+ if df is None or df .empty :
68+ print (" -> concatenated dataframe empty or None" )
69+
70+ # ---------- coerce important columns to numeric (in-place) ----------
71+ reco_needed = [PX_COL , PY_COL , PZ_COL , xbjk_COL , Q2_COL , E_COL , Y_COL ]
72+ mc_needed = [PX_MC_COL , PY_MC_COL , PZ_MC_COL , xbjk_MC_COL , Q2_MC_COL , Y_MC_COL ]
73+ DA_needed = [xbjk_DA_COL , Q2_DA_COL , Y_DA_COL ]
74+ ESIGMA_needed = [xbjk_ESIG_COL , Q2_ESIG_COL , Y_ESIG_COL ]
75+ JB_needed = [xbjk_JB_COL , Q2_JB_COL , Y_JB_COL ]
76+ SIGMA_needed = [xbjk_SIG_COL , Q2_SIG_COL , Y_SIG_COL ]
77+
78+ df [reco_needed ] = df [reco_needed ].apply (pd .to_numeric , errors = "coerce" )
79+ df [mc_needed ] = df [mc_needed ].apply (pd .to_numeric , errors = "coerce" )
80+ df [DA_needed ] = df [DA_needed ].apply (pd .to_numeric , errors = "coerce" )
81+ df [ESIGMA_needed ] = df [ESIGMA_needed ].apply (pd .to_numeric , errors = "coerce" )
82+ df [JB_needed ] = df [JB_needed ].apply (pd .to_numeric , errors = "coerce" )
83+ df [SIGMA_needed ] = df [SIGMA_needed ].apply (pd .to_numeric , errors = "coerce" )
84+
85+ # ---------- build masks ----------
86+ reco_complete_mask = df [reco_needed ].notna ().all (axis = 1 )
87+ mc_complete_mask = df [mc_needed ].notna ().all (axis = 1 )
88+ DA_mask = df [DA_needed ].notna ().all (axis = 1 )
89+ ESIGMA_mask = df [ESIGMA_needed ].notna ().all (axis = 1 )
90+ JB_mask = df [JB_needed ].notna ().all (axis = 1 )
91+ SIGMA_mask = df [SIGMA_needed ].notna ().all (axis = 1 )
92+
93+ matched_mask = reco_complete_mask & mc_complete_mask & DA_mask & ESIGMA_mask & JB_mask & SIGMA_mask
94+
95+ n_total = len (df )
96+ n_reco = int (reco_complete_mask .sum ())
97+ n_mc = int (mc_complete_mask .sum ())
98+ n_matched = int (matched_mask .sum ())
99+ print (f"Total events: { n_total } ; reco-complete: { n_reco } ; mc-complete: { n_mc } ; matched: { n_matched } " )
100+
101+ if n_matched == 0 :
102+ print (" -> No matched (reco+mc) events found; returning None" )
103+
104+ df_matched = df .loc [matched_mask ].copy ()
105+
106+ # ---------- MC arrays ----------
107+ px_mc = df_matched [PX_MC_COL ].to_numpy ()
108+ py_mc = df_matched [PY_MC_COL ].to_numpy ()
109+ pz_mc = df_matched [PZ_MC_COL ].to_numpy ()
110+ x_mc = df_matched [xbjk_MC_COL ].to_numpy ()
111+ q2_mc = df_matched [Q2_MC_COL ].to_numpy ()
112+ y_mc = df_matched [Y_MC_COL ].to_numpy ()
113+
114+ pt_mc = np .sqrt (px_mc ** 2 + py_mc ** 2 )
115+ p_mc = np .sqrt (px_mc ** 2 + py_mc ** 2 + pz_mc ** 2 )
116+ E_mc = p_mc .copy ()
117+ with np .errstate (invalid = "ignore" , divide = "ignore" ):
118+ theta_mc = np .arccos (np .clip (pz_mc / p_mc , - 1.0 , 1.0 ))
119+ eta_mc = - np .log (np .tan (theta_mc / 2.0 ))
120+ phi_mc = np .arctan2 (py_mc , px_mc )
121+
122+ # ---------- reco arrays ----------
123+ px = df_matched [PX_COL ].to_numpy ()
124+ py = df_matched [PY_COL ].to_numpy ()
125+ pz = df_matched [PZ_COL ].to_numpy ()
126+ x = df_matched [xbjk_COL ].to_numpy ()
127+ q2 = df_matched [Q2_COL ].to_numpy ()
128+ E_reco = df_matched [E_COL ].to_numpy ()
129+ y = df_matched [Y_COL ].to_numpy ()
130+
131+ pt = np .sqrt (px ** 2 + py ** 2 )
132+ p = np .sqrt (px ** 2 + py ** 2 + pz ** 2 )
133+ with np .errstate (invalid = "ignore" , divide = "ignore" ):
134+ theta = np .arccos (np .clip (pz / p , - 1.0 , 1.0 ))
135+ eta = - np .log (np .tan (theta / 2.0 ))
136+ phi = np .arctan2 (py , px )
137+
138+ # ---------- alternative method arrays ----------
139+ x_DA = df_matched [xbjk_DA_COL ].to_numpy ()
140+ q2_DA = df_matched [Q2_DA_COL ].to_numpy ()
141+ y_DA = df_matched [Y_DA_COL ].to_numpy ()
142+
143+ x_JB = df_matched [xbjk_JB_COL ].to_numpy ()
144+ q2_JB = df_matched [Q2_JB_COL ].to_numpy ()
145+ y_JB = df_matched [Y_JB_COL ].to_numpy ()
146+
147+ x_SIGMA = df_matched [xbjk_SIG_COL ].to_numpy ()
148+ q2_SIGMA = df_matched [Q2_SIG_COL ].to_numpy ()
149+ y_SIGMA = df_matched [Y_SIG_COL ].to_numpy ()
150+
151+ x_ESIGMA = df_matched [xbjk_ESIG_COL ].to_numpy ()
152+ q2_ESIGMA = df_matched [Q2_ESIG_COL ].to_numpy ()
153+ y_ESIGMA = df_matched [Y_ESIG_COL ].to_numpy ()
154+
155+ return {
156+ "n_total" : n_total , "n_reco" : n_reco , "n_mc" : n_mc , "n_matched" : n_matched ,
157+ "px_mc" : px_mc , "py_mc" : py_mc , "pz_mc" : pz_mc ,
158+ "pt_mc" : pt_mc , "p_mc" : p_mc , "theta_mc" : theta_mc , "eta_mc" : eta_mc , "phi_mc" : phi_mc ,
159+ "x_mc" : x_mc , "q2_mc" : q2_mc , "E_mc" : E_mc , "y_mc" : y_mc ,
160+ "px" : px , "py" : py , "pz" : pz ,
161+ "pt" : pt , "p" : p , "theta" : theta , "eta" : eta , "phi" : phi ,
162+ "x" : x , "q2" : q2 , "E_reco" : E_reco , "y" : y ,
163+ "x_DA" : x_DA , "q2_DA" : q2_DA , "y_DA" : y_DA ,
164+ "x_JB" : x_JB , "q2_JB" : q2_JB , "y_JB" : y_JB ,
165+ "x_SIGMA" : x_SIGMA , "q2_SIGMA" : q2_SIGMA , "y_SIGMA" : y_SIGMA ,
166+ "x_ESIGMA" : x_ESIGMA , "q2_ESIGMA" : q2_ESIGMA , "y_ESIGMA" : y_ESIGMA ,
167+ }
168+
169+
170+ def plot_reco_vs_true_all (file ,
171+ results_dir = "results" ,
172+ reco_methods = None ,
173+ reco_Q2 = None ,
174+ reco_y = None ,
175+ reco_x = None ,
176+ nbins_q2 = 200 ,
177+ nbins_y = 100 ,
178+ cmap = "viridis" ,
179+ figsize = (14 , 18 )):
180+
181+ if reco_methods is None :
182+ reco_methods = ["Electron Method" , "Double Angle" , "Jaquet-Blondel" , "Sigma" , "ESigma" ]
183+ if reco_Q2 is None :
184+ reco_Q2 = ["q2" , "q2_DA" , "q2_JB" , "q2_SIGMA" , "q2_ESIGMA" ]
185+ if reco_y is None :
186+ reco_y = ["y" , "y_DA" , "y_JB" , "y_SIGMA" , "y_ESIGMA" ]
187+ if reco_x is None :
188+ reco_x = ["x" , "x_DA" , "x_JB" , "x_SIGMA" , "x_ESIGMA" ]
189+
190+ df = process_and_mask (file )
191+ df = pd .DataFrame (df )
192+
193+ nmethods = len (reco_methods )
194+ if not (len (reco_Q2 ) == nmethods == len (reco_y ) == len (reco_x )):
195+ raise ValueError ("reco_methods, reco_Q2, reco_y, reco_x must have the same length" )
196+
197+ fig , axes = plt .subplots (nmethods , 3 , figsize = figsize , constrained_layout = False )
198+ fig .subplots_adjust (hspace = 0.35 , wspace = 0.45 , right = 0.92 )
199+
200+ for i , method in enumerate (reco_methods ):
201+ ax_q2 = axes [i , 0 ]
202+ ax_y = axes [i , 1 ]
203+ ax_x = axes [i , 2 ]
204+
205+ q2_col = reco_Q2 [i ]
206+ y_col = reco_y [i ]
207+ x_col = reco_x [i ]
208+
209+ # ---------- Q2 ----------
210+ mask_q2 = np .isfinite (df ["q2_mc" ]) & np .isfinite (df [q2_col ]) & (df ["q2_mc" ] > 0 ) & (df [q2_col ] > 0 )
211+ xq = df .loc [mask_q2 , "q2_mc" ].to_numpy ()
212+ yq = df .loc [mask_q2 , q2_col ].to_numpy ()
213+
214+ if len (xq ) > 10 :
215+ xedges = np .logspace (np .log10 (xq .min ()), np .log10 (xq .max ()), nbins_q2 + 1 )
216+ yedges = np .logspace (np .log10 (yq .min ()), np .log10 (yq .max ()), nbins_q2 + 1 )
217+ im_q2 = ax_q2 .hist2d (xq , yq , bins = (xedges , yedges ), norm = LogNorm (vmin = 1 ), cmin = 1 , cmap = cmap )
218+ ax_q2 .set_xscale ('log' ); ax_q2 .set_yscale ('log' )
219+ ax_q2 .set_xlim (10e0 , 10e2 ); ax_q2 .set_ylim (10e0 , 10e2 )
220+ ax_q2 .set_xlabel (r"$Q^2_{true}$ $(\mathrm{GeV}/c)^2$" )
221+ ax_q2 .set_ylabel (r"$Q^2_{reco}$ $(\mathrm{GeV}/c)^2$" )
222+ ax_q2 .set_title (f"{ method } — Q²" )
223+ line_coords = np .logspace (np .log10 (min (xedges [0 ], yedges [0 ])), np .log10 (max (xedges [- 1 ], yedges [- 1 ])), 100 )
224+ ax_q2 .plot (line_coords , line_coords , 'r--' , linewidth = 1.2 )
225+ divider = make_axes_locatable (ax_q2 )
226+ cax = divider .append_axes ("right" , size = "4%" , pad = 0.06 )
227+ fig .colorbar (im_q2 [3 ], cax = cax ).set_label ("counts" )
228+ else :
229+ ax_q2 .text (0.5 , 0.5 , "Insufficient Q2 data" , ha = 'center' , va = 'center' )
230+
231+ # ---------- x ----------
232+ mask_x = np .isfinite (df ["x_mc" ]) & np .isfinite (df [x_col ]) & (df ["x_mc" ] > 0 ) & (df [x_col ] > 0 )
233+ xx = df .loc [mask_x , "x_mc" ].to_numpy ()
234+ yx = df .loc [mask_x , x_col ].to_numpy ()
235+
236+ if len (xx ) > 10 :
237+ xedges_x = np .logspace (np .log10 (xx .min ()), np .log10 (xx .max ()), nbins_q2 + 1 )
238+ yedges_x = np .logspace (np .log10 (yx .min ()), np .log10 (yx .max ()), nbins_q2 + 1 )
239+ im_x = ax_x .hist2d (xx , yx , bins = (xedges_x , yedges_x ), norm = LogNorm (vmin = 1 ), cmin = 1 , cmap = cmap )
240+ ax_x .set_xscale ('log' ); ax_x .set_yscale ('log' )
241+ ax_x .set_xlim (10e-5 , 10e-1 ); ax_x .set_ylim (10e-5 , 10e-1 )
242+ ax_x .set_xlabel (r"$x_{true}$" ); ax_x .set_ylabel (r"$x_{reco}$" )
243+ ax_x .set_title (f"{ method } — x" )
244+ line_coords = np .logspace (np .log10 (min (xedges_x [0 ], yedges_x [0 ])), np .log10 (max (xedges_x [- 1 ], yedges_x [- 1 ])), 100 )
245+ ax_x .plot (line_coords , line_coords , 'r--' , linewidth = 1.2 )
246+ divider = make_axes_locatable (ax_x )
247+ cax = divider .append_axes ("right" , size = "4%" , pad = 0.06 )
248+ fig .colorbar (im_x [3 ], cax = cax ).set_label ("counts" )
249+ else :
250+ ax_x .text (0.5 , 0.5 , "Insufficient x data" , ha = 'center' , va = 'center' )
251+
252+ # ---------- y ----------
253+ mask_y = np .isfinite (df ["y_mc" ]) & np .isfinite (df [y_col ])
254+ xy = df .loc [mask_y , "y_mc" ].to_numpy ()
255+ yy = df .loc [mask_y , y_col ].to_numpy ()
256+
257+ if len (xy ) > 10 :
258+ im_y = ax_y .hist2d (xy , yy , bins = (nbins_y , nbins_y ), range = ((0 , 1 ), (0 , 1 )), norm = LogNorm (vmin = 1 ), cmin = 1 , cmap = cmap )
259+ ax_y .set_xlim (0 , 1 ); ax_y .set_ylim (0 , 1 )
260+ ax_y .set_xlabel (r"$y_{true}$" ); ax_y .set_ylabel (r"$y_{reco}$" )
261+ ax_y .set_title (f"{ method } — y" )
262+ ax_y .plot (np .linspace (0 , 1 , 100 ), np .linspace (0 , 1 , 100 ), 'r--' , linewidth = 1.0 )
263+ divider = make_axes_locatable (ax_y )
264+ cax = divider .append_axes ("right" , size = "4%" , pad = 0.06 )
265+ fig .colorbar (im_y [3 ], cax = cax ).set_label ("counts" )
266+ else :
267+ ax_y .text (0.5 , 0.5 , "Insufficient y data" , ha = 'center' , va = 'center' )
268+
269+ plt .tight_layout ()
270+
271+ # ---------- save to results directory ----------
272+ os .makedirs (results_dir , exist_ok = True )
273+ stem = os .path .splitext (os .path .basename (file ))[0 ] # e.g. "18x275_Reco_data"
274+ out_path = os .path .join (results_dir , f"{ stem } _reco_vs_true.png" )
275+ plt .savefig (out_path , dpi = 200 , bbox_inches = "tight" )
276+ print (f"Saved: { out_path } " )
277+ plt .close (fig )
278+
279+
280+ def parse_args ():
281+ parser = argparse .ArgumentParser (
282+ description = "Plot reco-vs-true kinematic comparisons for DIS Sullivan process events."
283+ )
284+ parser .add_argument (
285+ "file" ,
286+ help = "Path to input .parquet data file"
287+ )
288+ parser .add_argument (
289+ "--results-dir" , "-o" ,
290+ default = "results" ,
291+ help = "Directory to write output into (created if absent). Default: ./results"
292+ )
293+ parser .add_argument (
294+ "--nbins-q2" , type = int , default = 200 ,
295+ help = "Number of bins for Q2 and x 2D histograms. Default: 200"
296+ )
297+ parser .add_argument (
298+ "--nbins-y" , type = int , default = 100 ,
299+ help = "Number of bins for y 2D histogram. Default: 100"
300+ )
301+ parser .add_argument (
302+ "--cmap" , default = "viridis" ,
303+ help = "Matplotlib colormap name. Default: viridis"
304+ )
305+ return parser .parse_args ()
306+
307+
308+ if __name__ == "__main__" :
309+ args = parse_args ()
310+ plot_reco_vs_true_all (
311+ file = args .file ,
312+ results_dir = args .results_dir ,
313+ nbins_q2 = args .nbins_q2 ,
314+ nbins_y = args .nbins_y ,
315+ cmap = args .cmap ,
316+ )
0 commit comments