Skip to content

Commit cb0f749

Browse files
authored
Add log analysis script for homepage response time percentiles (#73)
1 parent 51447ee commit cb0f749

1 file changed

Lines changed: 208 additions & 0 deletions

File tree

scripts/log.py

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
import re
2+
import random
3+
import argparse
4+
from pathlib import Path
5+
from datetime import datetime
6+
from collections import defaultdict
7+
import numpy as np
8+
import matplotlib.pyplot as plt
9+
10+
# Upper bound on the number of response times retained per day; tune as needed.
MAX_SAMPLES = 200000

# Expected log line shape:
#   [pid: ...] <ip> ... [Tue Feb 10 14:47:06 2026] GET /... => generated ... in 2740 msecs
# The leading greedy ".*" makes the regex bind to the *last* [...] date block
# that precedes the GET (the line also starts with a bracketed [pid: ...] field).
# The adjacent raw strings below are concatenated into a single pattern.
pattern = re.compile(
    r".*\[(?P<date>(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s+[A-Za-z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4})\]"
    r"\s+GET\s+(?P<path>/[^\s]*)"
    r"\s+=> generated .* in (?P<time>\d+) msecs"
)

# Per-day reservoir state.
samples = defaultdict(list)  # day key -> retained response times
counts = defaultdict(int)  # day key -> number of observations offered so far
23+
24+
def reservoir_add(day, value):
    """Offer one observation to the reservoir sample kept for ``day``.

    Increments ``counts[day]`` and stores ``value`` in ``samples[day]``: by
    appending while the reservoir holds fewer than ``MAX_SAMPLES`` items,
    otherwise by possibly overwriting a randomly drawn slot.

    Args:
        day: Key identifying the day's reservoir.
        value: Observation to record.
    """
    counts[day] += 1
    seen = counts[day]
    bucket = samples[day]

    # Reservoir not yet full: every observation is kept.
    if len(bucket) < MAX_SAMPLES:
        bucket.append(value)
        return

    # Full: draw an index over everything seen so far. Only draws that land
    # inside the reservoir replace an entry, so the chance of replacement
    # shrinks as more observations arrive.
    slot = random.randint(0, seen - 1)
    if slot < MAX_SAMPLES:
        bucket[slot] = value
35+
36+
37+
def _default_log_file() -> Path | None:
38+
downloads = Path.home() / "Downloads"
39+
if not downloads.exists():
40+
return None
41+
# Prefer the exact historical naming scheme, but fall back to any similar file.
42+
matches = list(downloads.glob("orumsV3_spoken.log-*"))
43+
if not matches:
44+
matches = list(downloads.glob("*orumsV3_spoken*"))
45+
matches = sorted(matches, key=lambda p: p.stat().st_mtime, reverse=True)
46+
return matches[0] if matches else None
47+
48+
49+
def _parse_args() -> argparse.Namespace:
50+
parser = argparse.ArgumentParser(description="Compute homepage response-time percentiles from a log file.")
51+
parser.add_argument(
52+
"--log-file",
53+
type=str,
54+
default=None,
55+
help="Path to the log file. If omitted, uses the newest ~/Downloads/orumsV3_spoken.log-* file.",
56+
)
57+
parser.add_argument(
58+
"--max-samples",
59+
type=int,
60+
default=MAX_SAMPLES,
61+
help="Max reservoir samples per day (default: %(default)s).",
62+
)
63+
parser.add_argument(
64+
"--days",
65+
type=str,
66+
default="10,11,12,13,14,15",
67+
help="Comma-separated day-of-month numbers to include for Feb (default: %(default)s).",
68+
)
69+
parser.add_argument(
70+
"--homepage-match",
71+
choices=["strict", "allow-query"],
72+
default="allow-query",
73+
help="Homepage matching: strict='/' only; allow-query='/' and '/?…' (default: %(default)s).",
74+
)
75+
parser.add_argument(
76+
"--plot",
77+
choices=["compare", "hist", "both", "none"],
78+
default="compare",
79+
help="Plot type (default: %(default)s).",
80+
)
81+
return parser.parse_args()
82+
83+
84+
args = _parse_args()
# The command-line value replaces the module default used by the reservoir.
MAX_SAMPLES = args.max_samples

# Turn the comma-separated --days value into a set of day-of-month integers,
# ignoring empty items such as the one left by a trailing comma.
try:
    days = {int(x.strip()) for x in args.days.split(",") if x.strip()}
except ValueError:
    # Report bad input as a usage error instead of an unhandled traceback.
    raise SystemExit(
        f"Invalid --days value {args.days!r}: expected comma-separated integers, e.g. 10,11,12"
    ) from None
87+
88+
def _is_homepage(path: str) -> bool:
    """Tell whether ``path`` counts as the homepage under ``--homepage-match``.

    The bare ``/`` always qualifies. Unless the mode is ``strict``, a path
    beginning with ``/?`` (the root followed by a query string) qualifies too.
    """
    if path == "/":
        return True
    query_allowed = args.homepage_match != "strict"
    return query_allowed and path.startswith("/?")
93+
94+
# Resolve the log file: an explicit --log-file wins, otherwise auto-detect.
log_file = Path(args.log_file).expanduser() if args.log_file else _default_log_file()
if not log_file or not log_file.exists():
    if args.log_file:
        # The user named a path, so report that path as the problem rather
        # than claiming that auto-detection was attempted.
        raise SystemExit(f"Log file not found: {log_file}\n")
    downloads = Path.home() / "Downloads"
    default_hint = downloads / "orumsV3_spoken.log-*"
    raise SystemExit(
        "Log file not found.\n"
        f"Tried auto-detecting the newest file matching `{default_hint}`.\n"
        "Pass it explicitly, e.g.:\n"
        "  python log.py --log-file ~/Downloads/orumsV3_spoken.log-20260215\n"
    )
104+
105+
106+
# Stream the log and feed homepage response times into the per-day reservoirs.
# errors="ignore" drops undecodable bytes so they do not abort the run.
with open(log_file, "r", errors="ignore") as f:
    for line in f:
        # The pattern begins with ".*", so anchoring at the start with match()
        # finds the same match search() would, without re-trying the pattern
        # from every later offset on lines that do not match.
        m = pattern.match(line)
        if not m:
            continue

        path = m.group("path").strip()

        # homepage only
        if not _is_homepage(path):
            continue

        try:
            dt = datetime.strptime(m.group("date"), "%a %b %d %H:%M:%S %Y")
        except ValueError:
            # The regex accepts any three-letter month and any 1-2 digit day,
            # so an impossible timestamp can reach this point; skip that line
            # instead of aborting the whole run.
            continue

        # only February, restricted to the requested days of the month
        if dt.month == 2 and dt.day in days:
            reservoir_add(dt.strftime("%Y-%m-%d"), int(m.group("time")))
127+
128+
129+
# compute percentiles & print a comparison table
print("\n📊 Homepage response-time percentiles (Feb comparison)\n")
print(f"Log file: {log_file}")
print(f"Homepage match: {args.homepage_match}")
print("")

# Day keys carry the year found in the log, so report the year(s) actually
# seen instead of assuming one; otherwise data from any other year would be
# collected but never shown. 2026 is the fallback when nothing matched, which
# keeps the table of empty rows.
years = sorted({key.split("-", 1)[0] for key in counts}) or ["2026"]
ordered_days = [f"{year}-02-{d:02d}" for year in years for d in sorted(days)]

rows = []  # (day, samples_used, total_seen, p50, p80, p90); percentiles are None without samples
results = {}  # day -> np.ndarray (for optional plotting)

for day in ordered_days:
    arr_list = samples.get(day, [])
    if not arr_list:
        rows.append((day, 0, counts.get(day, 0), None, None, None))
        continue

    arr = np.array(arr_list)
    results[day] = arr

    # One call computes all three percentiles over the same array.
    p50, p80, p90 = (float(p) for p in np.percentile(arr, [50, 80, 90]))
    rows.append((day, len(arr), counts.get(day, len(arr)), p50, p80, p90))

header = ("Date", "samples_used", "total_seen", "p50_ms", "p80_ms", "p90_ms")
print(f"{header[0]:<12} {header[1]:>12} {header[2]:>10} {header[3]:>10} {header[4]:>10} {header[5]:>10}")
print("-" * 70)
for (day, n, total, p50, p80, p90) in rows:
    if p50 is None:
        # No samples for this day: print placeholders instead of numbers.
        print(f"{day:<12} {n:>12} {total:>10} {'-':>10} {'-':>10} {'-':>10}")
    else:
        print(f"{day:<12} {n:>12} {total:>10} {p50:>10.1f} {p80:>10.1f} {p90:>10.1f}")
162+
163+
164+
def _plot_histograms() -> None:
    """Create one 40-bin histogram figure per day present in ``results``.

    Figures are only created here; displaying them is left to the caller.
    """
    for day in results:
        plt.figure()
        plt.hist(results[day], bins=40)
        plt.xlabel("Response Time (ms)")
        plt.ylabel("Requests")
        plt.title(f"Homepage Response Time Distribution ({day})")
        plt.grid(True)
172+
173+
174+
def _plot_comparison() -> None:
    """Draw P50/P80/P90 as three lines across the days that have samples.

    Entries of ``rows`` whose sample count is zero are left out; if none
    remain, no figure is created.
    """
    # Each row is (day, samples_used, total_seen, p50, p80, p90).
    populated = [row for row in rows if row[1] != 0]
    if not populated:
        return

    xs = [row[0] for row in populated]
    series = (
        ("P50", [row[3] for row in populated]),
        ("P80", [row[4] for row in populated]),
        ("P90", [row[5] for row in populated]),
    )

    plt.figure()
    for label, values in series:
        plt.plot(xs, values, marker="o", label=label)
    plt.title("Homepage Response Time Percentiles (Feb 10–15)")
    plt.xlabel("Date")
    plt.ylabel("Response time (ms)")
    plt.grid(True)
    plt.legend()
    # Date labels are long; slant them so they do not overlap.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
201+
202+
203+
# Build the requested figures, then display them; "none" skips plotting entirely.
if args.plot != "none":
    if args.plot in ("hist", "both"):
        _plot_histograms()
    if args.plot in ("compare", "both"):
        _plot_comparison()
    plt.show()

0 commit comments

Comments
 (0)