-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathservice.go
More file actions
2314 lines (2067 loc) · 75.5 KB
/
service.go
File metadata and controls
2314 lines (2067 loc) · 75.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package main
import (
	"bytes"
	"context"
	"embed"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"log"
	"net"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"time"
	"unicode/utf8"
)
// publicFS holds the static assets embedded at build time from the
// ./public directory (served by the HTTP layer elsewhere in this package).
//
//go:embed public
var publicFS embed.FS
// Config holds all runtime settings for the telemetry service.
// NOTE(review): values are presumably populated from environment/flags at
// startup — confirm against the loader, which is outside this chunk.
type Config struct {
	ListenAddr         string   // address the HTTP server binds to
	TrustedProxiesCIDR []string // CIDRs whose X-Forwarded-For is trusted (see ProxyTrust)
	// ClickHouse (primary telemetry store)
	CHDSN string // clickhouse://user:pass@host:9000/telemetry_db
	// Limits
	MaxBodyBytes     int64         // max accepted request body size in bytes
	RateLimitRPM     int           // requests per minute per key
	RateBurst        int           // burst tokens
	RateKeyMode      string        // "ip" or "header"
	RateKeyHeader    string        // e.g. "X-Telemetry-Key"
	RequestTimeout   time.Duration // upstream timeout
	EnableReqLogging bool          // default false (GDPR-friendly)
	// Cache
	RedisURL     string
	EnableRedis  bool
	CacheTTL     time.Duration
	CacheEnabled bool
	// Alerts (SMTP)
	AlertEnabled          bool
	SMTPHost              string
	SMTPPort              int
	SMTPUser              string
	SMTPPassword          string
	SMTPFrom              string
	SMTPTo                []string
	SMTPUseTLS            bool
	AlertFailureThreshold float64       // failure-rate threshold that triggers an alert
	AlertCheckInterval    time.Duration // how often the alert check runs
	AlertCooldown         time.Duration // minimum gap between repeated alerts
	// GitHub Integration
	GitHubToken   string // Personal access token for creating issues
	GitHubOwner   string // Repository owner (e.g., "community-scripts")
	GitHubRepo    string // Repository name (e.g., "ProxmoxVE")
	AdminPassword string // Password to protect admin actions (issue creation)
}
// TelemetryIn matches the payload sent by api.func (the bash client).
// Field comments document the values the client is expected to send; the
// strict allowlists below (allowedType, allowedStatus, …) are the actual
// validation source of truth.
type TelemetryIn struct {
	// Required
	RandomID    string `json:"random_id"`              // Session UUID
	ExecutionID string `json:"execution_id,omitempty"` // Unique execution ID
	Type        string `json:"type"`                   // "lxc", "vm", "turnkey", "pve", "addon"
	NSAPP       string `json:"nsapp"`                  // Application name (e.g., "jellyfin")
	Status      string `json:"status"`                 // "installing", "success", "failed", "aborted", "unknown"
	// Container/VM specs
	CTType    int `json:"ct_type,omitempty"`    // 1=unprivileged, 2=privileged/VM
	DiskSize  int `json:"disk_size,omitempty"`  // GB
	CoreCount int `json:"core_count,omitempty"` // CPU cores
	RAMSize   int `json:"ram_size,omitempty"`   // MB
	// System info
	OsType    string `json:"os_type,omitempty"`    // "debian", "ubuntu", "alpine", etc.
	OsVersion string `json:"os_version,omitempty"` // "12", "24.04", etc.
	PveVer    string `json:"pve_version,omitempty"`
	// Optional
	Method   string `json:"method,omitempty"`    // "default", "advanced"
	Error    string `json:"error,omitempty"`     // Error description (max 120 chars)
	ExitCode int    `json:"exit_code,omitempty"` // 0-255
	// === EXTENDED FIELDS ===
	// GPU Passthrough stats
	GPUVendor      string `json:"gpu_vendor,omitempty"`      // "intel", "amd", "nvidia"
	GPUModel       string `json:"gpu_model,omitempty"`       // e.g., "Intel Arc Graphics"
	GPUPassthrough string `json:"gpu_passthrough,omitempty"` // "igpu", "dgpu", "vgpu", "none"
	// CPU stats
	CPUVendor string `json:"cpu_vendor,omitempty"` // "intel", "amd", "arm"
	CPUModel  string `json:"cpu_model,omitempty"`  // e.g., "Intel Core Ultra 7 155H"
	// RAM stats
	RAMSpeed string `json:"ram_speed,omitempty"` // e.g., "4800" (MT/s)
	// Performance metrics
	InstallDuration int `json:"install_duration,omitempty"` // Seconds
	// Error categorization
	ErrorCategory string `json:"error_category,omitempty"` // "network", "storage", "dependency", "permission", "timeout", "unknown"
	// Repository source for collection routing
	RepoSource string `json:"repo_source,omitempty"` // "ProxmoxVE", "ProxmoxVED", or "external"
}
// TelemetryOut is the output shape for telemetry records — the normalized
// form that gets enqueued and inserted into ClickHouse. Its JSON tags mirror
// TelemetryIn so fields can be copied one-to-one after validation.
type TelemetryOut struct {
	RandomID    string `json:"random_id"`
	ExecutionID string `json:"execution_id,omitempty"`
	Type        string `json:"type"`
	NSAPP       string `json:"nsapp"`
	Status      string `json:"status"`
	CTType      int    `json:"ct_type,omitempty"`
	DiskSize    int    `json:"disk_size,omitempty"`
	CoreCount   int    `json:"core_count,omitempty"`
	RAMSize     int    `json:"ram_size,omitempty"`
	OsType      string `json:"os_type,omitempty"`
	OsVersion   string `json:"os_version,omitempty"`
	PveVer      string `json:"pve_version,omitempty"`
	Method      string `json:"method,omitempty"`
	Error       string `json:"error,omitempty"`
	ExitCode    int    `json:"exit_code,omitempty"`
	// Extended fields
	GPUVendor       string `json:"gpu_vendor,omitempty"`
	GPUModel        string `json:"gpu_model,omitempty"`
	GPUPassthrough  string `json:"gpu_passthrough,omitempty"`
	CPUVendor       string `json:"cpu_vendor,omitempty"`
	CPUModel        string `json:"cpu_model,omitempty"`
	RAMSpeed        string `json:"ram_speed,omitempty"`
	InstallDuration int    `json:"install_duration,omitempty"`
	ErrorCategory   string `json:"error_category,omitempty"`
	// Repository source: "ProxmoxVE", "ProxmoxVED", or "external"
	RepoSource string `json:"repo_source,omitempty"`
}
// TelemetryStatusUpdate contains only the fields needed for status updates
// (the follow-up event a client sends after the initial "installing" record).
type TelemetryStatusUpdate struct {
	Status          string `json:"status"`
	ExecutionID     string `json:"execution_id,omitempty"`
	Error           string `json:"error,omitempty"`
	ExitCode        int    `json:"exit_code"` // no omitempty: 0 (success) must survive encoding
	InstallDuration int    `json:"install_duration,omitempty"`
	ErrorCategory   string `json:"error_category,omitempty"`
	GPUVendor       string `json:"gpu_vendor,omitempty"`
	GPUModel        string `json:"gpu_model,omitempty"`
	GPUPassthrough  string `json:"gpu_passthrough,omitempty"`
	CPUVendor       string `json:"cpu_vendor,omitempty"`
	CPUModel        string `json:"cpu_model,omitempty"`
	RAMSpeed        string `json:"ram_speed,omitempty"`
	RepoSource      string `json:"repo_source,omitempty"`
}
// allowedRepoSource is the strict allowlist for the 'repo_source' field.
// NOTE(review): values outside this set are presumably rejected or blanked by
// the validation handler — confirm at the call site (outside this chunk).
var allowedRepoSource = map[string]bool{
	"ProxmoxVE":  true,
	"ProxmoxVED": true,
	"external":   true,
}
// ---------- Write-Ahead Queue ----------
// Decouples HTTP accept from ClickHouse write. The /telemetry handler enqueues
// work and returns 202 immediately. A pool of workers drains the queue with retries.

// WriteItem is a single telemetry payload queued for ClickHouse write.
type WriteItem struct {
	Payload   TelemetryOut
	Attempt   int       // failed write attempts so far (see worker retry loop)
	EnqueueAt time.Time // when the item entered the queue
}

// WriteQueue buffers telemetry writes and processes them via worker goroutines.
type WriteQueue struct {
	ch       chan WriteItem
	client   *CHClient // ClickHouse client (declared elsewhere in this package)
	workers  int       // number of drain goroutines launched by Start
	maxRetry int       // maximum write attempts per item
	index    *ExecIndex // in-memory execution_id dedup
}
// NewWriteQueue creates a buffered write queue with the given channel
// capacity and worker count. Workers are not started until Start is called.
func NewWriteQueue(client *CHClient, capacity, workers int, index *ExecIndex) *WriteQueue {
	return &WriteQueue{
		ch:       make(chan WriteItem, capacity),
		client:   client,
		workers:  workers,
		maxRetry: 3, // hard-coded: initial attempt + two retries
		index:    index,
	}
}
// Start launches the configured number of background worker goroutines that
// drain the queue, then logs the pool size and buffer capacity.
func (wq *WriteQueue) Start() {
	for n := 0; n < wq.workers; n++ {
		go wq.worker(n)
	}
	log.Printf("[QUEUE] Started %d write workers (buffer=%d)", wq.workers, cap(wq.ch))
}
// Enqueue adds a payload to the write queue without blocking.
// It reports false when the queue is at capacity (the item is dropped).
func (wq *WriteQueue) Enqueue(payload TelemetryOut) bool {
	item := WriteItem{Payload: payload, EnqueueAt: time.Now()}
	select {
	case wq.ch <- item:
		return true
	default:
		return false
	}
}
// Len returns the current queue depth (items waiting for a worker).
func (wq *WriteQueue) Len() int {
	return len(wq.ch)
}
// worker is the per-goroutine drain loop: it pulls items off the channel,
// writes each to ClickHouse with a 30s timeout, and retries failures with
// exponential backoff (1s, 2s, 4s) up to maxRetry attempts. The loop exits
// when the channel is closed.
func (wq *WriteQueue) worker(id int) {
	for item := range wq.ch {
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		err := wq.processItem(ctx, item)
		cancel()
		if err != nil {
			item.Attempt++
			if item.Attempt < wq.maxRetry {
				// Exponential backoff: 1s, 2s, 4s
				// NOTE(review): this sleep blocks the whole worker, not just
				// the failed item — queue throughput drops while retrying.
				backoff := time.Duration(1<<uint(item.Attempt)) * time.Second
				time.Sleep(backoff)
				// Re-enqueue for retry (non-blocking — drop if queue full)
				select {
				case wq.ch <- item:
				default:
					log.Printf("[QUEUE] worker %d: retry queue full, dropping nsapp=%s status=%s exec=%s (attempt %d)",
						id, item.Payload.NSAPP, item.Payload.Status, item.Payload.ExecutionID, item.Attempt)
				}
			} else {
				log.Printf("[QUEUE] worker %d: final failure nsapp=%s status=%s exec=%s: %v",
					id, item.Payload.NSAPP, item.Payload.Status, item.Payload.ExecutionID, err)
			}
		}
	}
}
// processItem performs the ClickHouse INSERT for one queued item.
// Every event is a new row in ClickHouse (append-only). There is no find+update — every event is a new row.
// The stats MV only counts terminal statuses, so "installing" events don't inflate counts.
func (wq *WriteQueue) processItem(ctx context.Context, item WriteItem) error {
	p := item.Payload

	// Dedup: a repeated "installing" event for an execution_id we have already
	// seen is silently skipped.
	if p.Status == "installing" && p.ExecutionID != "" {
		if _, seen := wq.index.Get(p.ExecutionID); seen {
			return nil
		}
	}

	// Append the row (all statuses — installing, configuring, success, failed, etc.).
	if err := wq.client.InsertTelemetry(ctx, p); err != nil {
		return err
	}

	// Keep the in-memory execution index in sync with what was written.
	if p.ExecutionID == "" {
		return nil
	}
	switch p.Status {
	case "installing":
		wq.index.Set(p.ExecutionID, p.ExecutionID)
	case "success", "failed", "aborted", "unknown":
		wq.index.Delete(p.ExecutionID)
	}
	return nil
}
// ---------- In-Memory Execution ID Index ----------

// ExecIndex is a concurrency-safe map from execution_id to a record ID,
// letting workers detect duplicate events without hitting the store.
// (Historical note: "record ID" dates from the PocketBase backend; the
// ClickHouse path stores the execution_id itself as the value.)
type ExecIndex struct {
	m sync.Map
}

// NewExecIndex returns an empty, ready-to-use index.
func NewExecIndex() *ExecIndex {
	return new(ExecIndex)
}

// Get looks up the record ID stored for executionID.
func (idx *ExecIndex) Get(executionID string) (string, bool) {
	if v, ok := idx.m.Load(executionID); ok {
		return v.(string), true
	}
	return "", false
}

// Set stores the executionID → recordID mapping.
// Empty keys or values are ignored to keep the index free of junk entries.
func (idx *ExecIndex) Set(executionID, recordID string) {
	if executionID == "" || recordID == "" {
		return
	}
	idx.m.Store(executionID, recordID)
}

// Delete removes executionID from the index (no-op if absent).
func (idx *ExecIndex) Delete(executionID string) {
	idx.m.Delete(executionID)
}
// -------- Rate limiter (token bucket / minute window, simple) --------

// bucket holds the remaining tokens for one key within the current window.
type bucket struct {
	tokens int
	reset  time.Time
}

// RateLimiter is a coarse per-key token bucket: each key gets
// min(burst, rpm) tokens per one-minute window. A background goroutine
// evicts buckets that have been idle for two windows.
type RateLimiter struct {
	mu       sync.Mutex
	buckets  map[string]*bucket
	rpm      int
	burst    int
	window   time.Duration
	cleanInt time.Duration
}

// NewRateLimiter builds a limiter allowing rpm requests/minute with the
// given burst, and starts the background cleanup goroutine.
func NewRateLimiter(rpm, burst int) *RateLimiter {
	limiter := &RateLimiter{
		buckets:  map[string]*bucket{},
		rpm:      rpm,
		burst:    burst,
		window:   time.Minute,
		cleanInt: 5 * time.Minute,
	}
	go limiter.cleanupLoop()
	return limiter
}

// cleanupLoop periodically drops buckets whose window expired more than two
// windows ago, bounding memory for one-off keys.
// NOTE(review): this goroutine has no stop signal; fine for a
// process-lifetime singleton, but it cannot be shut down cleanly.
func (r *RateLimiter) cleanupLoop() {
	ticker := time.NewTicker(r.cleanInt)
	defer ticker.Stop()
	for range ticker.C {
		cutoff := time.Now()
		r.mu.Lock()
		for key, b := range r.buckets {
			if cutoff.After(b.reset.Add(2 * r.window)) {
				delete(r.buckets, key)
			}
		}
		r.mu.Unlock()
	}
}

// Allow reports whether the request identified by key may proceed.
// A non-positive rpm disables limiting entirely.
func (r *RateLimiter) Allow(key string) bool {
	if r.rpm <= 0 {
		return true
	}
	now := time.Now()
	r.mu.Lock()
	defer r.mu.Unlock()
	b := r.buckets[key]
	if b == nil || now.After(b.reset) {
		// New key or expired window: refill to min(burst, rpm).
		b = &bucket{tokens: min(r.burst, r.rpm), reset: now.Add(r.window)}
		r.buckets[key] = b
	}
	if b.tokens <= 0 {
		return false
	}
	b.tokens--
	return true
}

// min returns the smaller of two ints.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// -------- Utility: GDPR-safe key extraction --------
type ProxyTrust struct {
nets []*net.IPNet
}
func NewProxyTrust(cidrs []string) (*ProxyTrust, error) {
var nets []*net.IPNet
for _, c := range cidrs {
_, n, err := net.ParseCIDR(strings.TrimSpace(c))
if err != nil {
return nil, err
}
nets = append(nets, n)
}
return &ProxyTrust{nets: nets}, nil
}
func (pt *ProxyTrust) isTrusted(ip net.IP) bool {
for _, n := range pt.nets {
if n.Contains(ip) {
return true
}
}
return false
}
// isPrivateIP returns true if the IP is in RFC 1918 / RFC 6598 private ranges
// (10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16, 100.64.0.0/10) or loopback.
// These are always trusted as reverse proxy sources.
func isPrivateIP(ip net.IP) bool {
if ip.IsLoopback() {
return true
}
privateRanges := []struct {
start net.IP
end net.IP
}{
{net.ParseIP("10.0.0.0"), net.ParseIP("10.255.255.255")},
{net.ParseIP("172.16.0.0"), net.ParseIP("172.31.255.255")},
{net.ParseIP("192.168.0.0"), net.ParseIP("192.168.255.255")},
{net.ParseIP("100.64.0.0"), net.ParseIP("100.127.255.255")},
}
ip4 := ip.To4()
if ip4 == nil {
return false
}
for _, r := range privateRanges {
if bytes.Compare(ip4, r.start.To4()) >= 0 && bytes.Compare(ip4, r.end.To4()) <= 0 {
return true
}
}
return false
}
// getClientIP derives the client IP used for rate limiting.
// When the direct peer is a configured trusted proxy OR a private/RFC1918
// address (common Docker/K8s/reverse-proxy setup), the first entry of
// X-Forwarded-For is honored; otherwise the socket peer address is returned.
// Returns nil if RemoteAddr cannot be parsed.
func getClientIP(r *http.Request, pt *ProxyTrust) net.IP {
	host, _, _ := net.SplitHostPort(r.RemoteAddr)
	peer := net.ParseIP(host)
	if peer == nil {
		return nil
	}
	if (pt != nil && pt.isTrusted(peer)) || isPrivateIP(peer) {
		if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
			// The left-most XFF entry is the original client.
			parts := strings.Split(xff, ",")
			if ip := net.ParseIP(strings.TrimSpace(parts[0])); ip != nil {
				return ip
			}
		}
	}
	return peer
}
// -------- Validation (strict allowlist) --------
// Every enumerated telemetry field is validated against one of the maps
// below; values outside an allowlist are not accepted as-is.
var (
	// Allowed values for 'type' field
	allowedType = map[string]bool{"lxc": true, "vm": true, "turnkey": true, "pve": true, "addon": true, "tool": true}
	// Allowed values for 'status' field
	allowedStatus = map[string]bool{"installing": true, "validation": true, "configuring": true, "success": true, "failed": true, "aborted": true, "unknown": true}
	// Allowed values for 'os_type' field
	allowedOsType = map[string]bool{
		"debian": true, "ubuntu": true, "alpine": true, "devuan": true,
		"fedora": true, "rocky": true, "alma": true, "centos": true,
		"opensuse": true, "gentoo": true, "openeuler": true,
		// VM-specific OS types
		"homeassistant": true, "opnsense": true, "openwrt": true,
		"mikrotik": true, "umbrel-os": true, "pimox-haos": true,
		"owncloud": true, "turnkey-nextcloud": true, "arch-linux": true,
	}
	// Allowed values for 'gpu_vendor' field (empty string = not reported)
	allowedGPUVendor = map[string]bool{"intel": true, "amd": true, "nvidia": true, "unknown": true, "": true}
	// Allowed values for 'gpu_passthrough' field (empty string = not reported)
	allowedGPUPassthrough = map[string]bool{"igpu": true, "dgpu": true, "vgpu": true, "none": true, "unknown": true, "": true}
	// Allowed values for 'cpu_vendor' field (empty string = not reported)
	allowedCPUVendor = map[string]bool{"intel": true, "amd": true, "arm": true, "apple": true, "qualcomm": true, "unknown": true, "": true}
	// Allowed values for 'error_category' field (empty string = not reported)
	allowedErrorCategory = map[string]bool{
		"network": true, "storage": true, "dependency": true, "permission": true,
		"timeout": true, "config": true, "resource": true, "unknown": true, "": true,
		"user_aborted": true, "apt": true, "command_not_found": true,
		"service": true, "database": true, "signal": true, "proxmox": true,
		"shell": true, "build": true, "preflight": true,
	}
	// exitCodeInfo consolidates description and category for all known exit codes.
	// This is the single source of truth — dashboard.go and all other code should
	// use getExitCodeDescription() / getExitCodeCategory() instead of duplicating.
	// Category strings must stay within allowedErrorCategory above.
	exitCodeInfo = map[int]struct {
		Desc     string
		Category string
	}{
		// --- Generic / Shell ---
		0: {"Success", ""},
		1: {"General error", "unknown"},
		2: {"Misuse of shell builtins", "unknown"},
		3: {"General syntax or argument error", "unknown"},
		// --- curl / wget ---
		4:  {"curl: Feature not supported or protocol error", "network"},
		5:  {"curl: Could not resolve proxy", "network"},
		6:  {"curl: DNS resolution failed", "network"},
		7:  {"curl: Connection refused / host down", "network"},
		8:  {"curl: Server reply error", "network"},
		16: {"curl: HTTP/2 framing layer error", "network"},
		18: {"curl: Partial file (transfer incomplete)", "network"},
		22: {"curl: HTTP error (404/500 etc.)", "network"},
		23: {"curl: Write error (disk full?)", "storage"},
		24: {"curl: Write to local file failed", "storage"},
		25: {"curl: Upload failed", "network"},
		26: {"curl: Read error on local file (I/O)", "storage"},
		27: {"curl: Out of memory", "resource"},
		28: {"curl: Connection timed out", "timeout"},
		30: {"curl: FTP port command failed", "network"},
		32: {"curl: FTP SIZE command failed", "network"},
		33: {"curl: HTTP range error", "network"},
		34: {"curl: HTTP post error", "network"},
		35: {"curl: SSL/TLS handshake failed", "network"},
		36: {"curl: FTP bad download resume", "network"},
		47: {"curl: Too many redirects", "network"},
		51: {"curl: SSL peer certificate verification failed", "network"},
		52: {"curl: Empty reply from server", "network"},
		55: {"curl: Failed sending network data", "network"},
		56: {"curl: Receive error (connection reset)", "network"},
		59: {"curl: Couldn't use specified SSL cipher", "network"},
		75: {"Temporary failure (retry later)", "network"},
		78: {"curl: Remote file not found (404)", "network"},
		92: {"curl: HTTP/2 stream error", "network"},
		95: {"curl: HTTP/3 layer error", "network"},
		// --- Docker / Privileged ---
		10: {"Docker / privileged mode required", "config"},
		// --- BSD sysexits.h (64-78) ---
		64: {"Usage error (wrong arguments)", "config"},
		65: {"Data format error (bad input data)", "unknown"},
		66: {"Input file not found", "unknown"},
		67: {"User not found", "unknown"},
		68: {"Host not found", "network"},
		69: {"Service unavailable", "service"},
		70: {"Internal software error", "unknown"},
		71: {"System error (OS-level failure)", "unknown"},
		72: {"Critical OS file missing", "unknown"},
		73: {"Cannot create output file", "storage"},
		74: {"I/O error", "storage"},
		76: {"Remote protocol error", "network"},
		77: {"Permission denied", "permission"},
		// --- APT / DPKG ---
		100: {"APT: Package manager error (broken packages)", "apt"},
		101: {"APT: Configuration error (bad sources)", "apt"},
		102: {"APT: Lock held by another process", "apt"},
		// --- Script Validation & Setup (103-123) ---
		103: {"Validation: Shell is not Bash", "preflight"},
		104: {"Validation: Not running as root", "preflight"},
		105: {"Validation: PVE version not supported", "preflight"},
		106: {"Validation: Architecture not supported (ARM/PiMox)", "preflight"},
		107: {"Validation: Kernel key parameters unreadable", "preflight"},
		108: {"Validation: Kernel key limits exceeded", "preflight"},
		109: {"Proxmox: No available container ID", "proxmox"},
		110: {"Proxmox: Failed to apply default.vars", "proxmox"},
		111: {"Proxmox: App defaults file not available", "proxmox"},
		112: {"Proxmox: Invalid install menu option", "config"},
		113: {"LXC: Under-provisioned — user aborted", "user_aborted"},
		114: {"LXC: Storage too low — user aborted", "user_aborted"},
		115: {"Download: install.func failed or incomplete", "network"},
		116: {"Proxmox: Default bridge vmbr0 not found", "config"},
		117: {"LXC: Container did not reach running state", "proxmox"},
		118: {"LXC: No IP assigned after timeout", "timeout"},
		119: {"Proxmox: No valid storage for rootdir", "storage"},
		120: {"Proxmox: No valid storage for vztmpl", "storage"},
		121: {"LXC: Container network not ready", "network"},
		122: {"LXC: No internet — user declined", "user_aborted"},
		123: {"LXC: Local IP detection failed", "network"},
		// --- Common shell/system errors ---
		124: {"Command timed out", "timeout"},
		125: {"Docker daemon error / command failed to start", "config"},
		126: {"Command cannot execute (permission problem)", "permission"},
		127: {"Command not found", "command_not_found"},
		128: {"Invalid argument to exit", "signal"},
		129: {"Killed by SIGHUP (terminal closed)", "user_aborted"},
		130: {"Script terminated by Ctrl+C (SIGINT)", "user_aborted"},
		131: {"Killed by SIGQUIT (core dump)", "signal"},
		132: {"Killed by SIGILL (illegal instruction)", "signal"},
		134: {"Process aborted (SIGABRT)", "signal"},
		137: {"Process killed (SIGKILL) — likely OOM", "resource"},
		139: {"Segmentation fault (SIGSEGV)", "unknown"},
		141: {"Broken pipe (SIGPIPE)", "signal"},
		143: {"Process terminated (SIGTERM)", "signal"},
		144: {"Killed by signal 16 (SIGUSR1/SIGSTKFLT)", "signal"},
		146: {"Killed by signal 18 (SIGTSTP)", "signal"},
		// --- Systemd / Service errors (150-154) ---
		150: {"Systemd: Service failed to start", "service"},
		151: {"Systemd: Service unit not found", "service"},
		152: {"Permission denied (EACCES)", "permission"},
		153: {"Build/compile failed (make/gcc/cmake)", "build"},
		154: {"Node.js: Native addon build failed (node-gyp)", "build"},
		// --- Python / pip / uv (160-162) ---
		160: {"Python: Virtualenv/uv environment missing or broken", "dependency"},
		161: {"Python: Dependency resolution failed", "dependency"},
		162: {"Python: Installation aborted (EXTERNALLY-MANAGED)", "dependency"},
		// --- PostgreSQL (170-173) ---
		170: {"PostgreSQL: Connection failed", "database"},
		171: {"PostgreSQL: Authentication failed", "database"},
		172: {"PostgreSQL: Database does not exist", "database"},
		173: {"PostgreSQL: Fatal error in query", "database"},
		// --- MySQL / MariaDB (180-183) ---
		180: {"MySQL/MariaDB: Connection failed", "database"},
		181: {"MySQL/MariaDB: Authentication failed", "database"},
		182: {"MySQL/MariaDB: Database does not exist", "database"},
		183: {"MySQL/MariaDB: Fatal error in query", "database"},
		// --- MongoDB (190-193) ---
		190: {"MongoDB: Connection failed", "database"},
		191: {"MongoDB: Authentication failed", "database"},
		192: {"MongoDB: Database not found", "database"},
		193: {"MongoDB: Fatal query error", "database"},
		// --- Proxmox Custom Codes (200-231) ---
		200: {"Proxmox: Failed to create lock file", "proxmox"},
		203: {"Proxmox: Missing CTID variable", "config"},
		204: {"Proxmox: Missing PCT_OSTYPE variable", "config"},
		205: {"Proxmox: Invalid CTID (<100)", "config"},
		206: {"Proxmox: CTID already in use", "config"},
		207: {"Proxmox: Password contains unescaped special chars", "config"},
		208: {"Proxmox: Invalid configuration (DNS/MAC/Network)", "config"},
		209: {"Proxmox: Container creation failed", "proxmox"},
		210: {"Proxmox: Cluster not quorate", "proxmox"},
		211: {"Proxmox: Timeout waiting for template lock", "timeout"},
		212: {"Proxmox: Storage 'iscsidirect' does not support containers", "proxmox"},
		213: {"Proxmox: Storage does not support 'rootdir' content", "proxmox"},
		214: {"Proxmox: Not enough storage space", "storage"},
		215: {"Proxmox: Container created but not listed (ghost state)", "proxmox"},
		216: {"Proxmox: RootFS entry missing in config", "proxmox"},
		217: {"Proxmox: Storage not accessible", "storage"},
		218: {"Proxmox: Template file corrupted or incomplete", "proxmox"},
		219: {"Proxmox: CephFS does not support containers", "storage"},
		220: {"Proxmox: Unable to resolve template path", "proxmox"},
		221: {"Proxmox: Template file not readable", "proxmox"},
		222: {"Proxmox: Template download failed", "proxmox"},
		223: {"Proxmox: Template not available after download", "proxmox"},
		224: {"Proxmox: PBS storage is for backups only", "storage"},
		225: {"Proxmox: No template available for OS/Version", "proxmox"},
		226: {"Proxmox: VM disk import or post-creation setup failed", "proxmox"},
		231: {"Proxmox: LXC stack upgrade failed", "proxmox"},
		// --- Tools & Addon Scripts (232-238) ---
		232: {"Tools: Wrong execution environment", "config"},
		233: {"Tools: Application not installed (update prerequisite missing)", "config"},
		234: {"Tools: No LXC containers found", "proxmox"},
		235: {"Tools: Backup or restore operation failed", "storage"},
		236: {"Tools: Required hardware not detected", "config"},
		237: {"Tools: Dependency package installation failed", "dependency"},
		238: {"Tools: OS or distribution not supported", "config"},
		// --- Node.js / npm (239-249) ---
		239: {"npm/Node.js: Unexpected runtime error", "dependency"},
		243: {"Node.js: Out of memory (heap overflow)", "resource"},
		245: {"Node.js: Invalid command-line option", "config"},
		246: {"Node.js: Internal JavaScript Parse Error", "unknown"},
		247: {"Node.js: Fatal internal error", "unknown"},
		248: {"Node.js: Invalid C++ addon / N-API failure", "unknown"},
		249: {"npm/pnpm/yarn: Unknown fatal error", "unknown"},
		// --- Application Install/Update Errors (250-254) ---
		250: {"App: Download failed or version not determined", "network"},
		251: {"App: File extraction failed (corrupt/incomplete)", "storage"},
		252: {"App: Required file or resource not found", "unknown"},
		253: {"App: Data migration required — update aborted", "config"},
		254: {"App: User declined prompt or input timed out", "user_aborted"},
		// --- DPKG ---
		255: {"DPKG: Fatal internal error / set -e triggered", "apt"},
	}
)
// getExitCodeDescription returns the human-readable description for an exit code.
// Falls back to a signal-based description for codes 128-191, or "Unknown" otherwise.
func getExitCodeDescription(code int) string {
	if info, ok := exitCodeInfo[code]; ok {
		return info.Desc
	}
	if code > 128 && code < 192 {
		sig := code - 128
		var name string
		switch sig {
		case 1:
			name = "SIGHUP"
		case 2:
			name = "SIGINT"
		case 3:
			name = "SIGQUIT"
		case 6:
			name = "SIGABRT"
		case 9:
			name = "SIGKILL"
		case 11:
			name = "SIGSEGV"
		case 13:
			name = "SIGPIPE"
		case 15:
			name = "SIGTERM"
		case 24:
			name = "SIGXCPU"
		case 25:
			name = "SIGXFSZ"
		}
		if name != "" {
			return fmt.Sprintf("Killed by %s (signal %d)", name, sig)
		}
		return fmt.Sprintf("Killed by signal %d", sig)
	}
	return fmt.Sprintf("Unknown (exit code %d)", code)
}
// getExitCodeCategory returns the error category for an exit code.
// Falls back to "signal" for codes 128-191, or "unknown" otherwise.
func getExitCodeCategory(code int) string {
	if info, ok := exitCodeInfo[code]; ok {
		return info.Category
	}
	switch {
	case code > 128 && code < 192:
		return "signal"
	default:
		return "unknown"
	}
}
// sanitizeShort normalizes a single-line string for storage: trims
// surrounding whitespace, flattens CR/LF to spaces, and truncates to at most
// max bytes. Truncation is UTF-8 aware — it backs up to a rune boundary so a
// multi-byte character is never split (the old byte slice could emit invalid
// UTF-8). A non-positive max yields the empty string.
func sanitizeShort(s string, max int) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	if max <= 0 {
		return ""
	}
	// remove line breaks and high-risk chars
	s = strings.ReplaceAll(s, "\n", " ")
	s = strings.ReplaceAll(s, "\r", " ")
	if len(s) > max {
		cut := max
		// Back up past UTF-8 continuation bytes to the nearest rune start.
		for cut > 0 && !utf8.RuneStart(s[cut]) {
			cut--
		}
		s = s[:cut]
	}
	return s
}
// sanitizeMultiLine allows newlines (for log output): it trims surrounding
// whitespace, normalizes CRLF/CR to LF, and limits total length to max bytes.
// Truncation is UTF-8 aware — it backs up to a rune boundary so a multi-byte
// character is never split (the old byte slice could emit invalid UTF-8).
// A non-positive max yields the empty string.
func sanitizeMultiLine(s string, max int) string {
	s = strings.TrimSpace(s)
	if s == "" {
		return ""
	}
	if max <= 0 {
		return ""
	}
	s = strings.ReplaceAll(s, "\r\n", "\n")
	s = strings.ReplaceAll(s, "\r", "\n")
	if len(s) > max {
		cut := max
		// Back up past UTF-8 continuation bytes to the nearest rune start.
		for cut > 0 && !utf8.RuneStart(s[cut]) {
			cut--
		}
		s = s[:cut]
	}
	return s
}
// ipv4Re is used for GDPR anonymization of addresses embedded in text.
// NOTE(review): the pattern only matches THREE dotted octets (e.g. it matches
// "192.168.1" of "192.168.1.100"), not a full IPv4 address — confirm at the
// call site whether this partial match is intentional before changing it.
var ipv4Re = regexp.MustCompile(`(\d{1,3}\.)\d{1,3}\.\d{1,3}`)

// numericFieldSuffixRe matches JSON numeric fields with unit suffixes like 32G, 100M.
// Captures: "field_name": 123G → replace with "field_name": 123
var numericFieldSuffixRe = regexp.MustCompile(`("(?:disk_size|core_count|ram_size)"\s*:\s*)(\d+)[A-Za-z]+`)

// hexEscapeRe matches invalid \xNN hex escapes in JSON strings (JSON only supports \uNNNN).
var hexEscapeRe = regexp.MustCompile(`\\x([0-9A-Fa-f]{2})`)
// sanitizeRawJSON attempts to fix common JSON encoding issues from bash clients.
// Called only when the initial json.Decode fails — this is a best-effort rescue.
// It applies three repairs in order:
//  1. strip unit suffixes from known numeric fields,
//  2. rewrite \xNN hex escapes as \u00NN,
//  3. walk the bytes with a tiny string/escape state machine to double
//     invalid backslash escapes and escape raw control characters.
func sanitizeRawJSON(raw []byte) []byte {
	// 1. Strip unit suffixes from numeric fields: "disk_size": 32G → "disk_size": 32
	raw = numericFieldSuffixRe.ReplaceAll(raw, []byte("${1}${2}"))
	// 2. Replace \xNN hex escapes with \u00NN (valid JSON unicode escapes)
	raw = hexEscapeRe.ReplaceAll(raw, []byte(`\u00$1`))
	// 3. Replace literal control characters inside strings with safe alternatives
	// AND fix invalid escape sequences (e.g. \_ \G \P -> \\_ \\G \\P)
	// Valid JSON escapes: \" \\ \/ \b \f \n \r \t \uXXXX
	inString := false // are we between unescaped double quotes?
	escaped := false  // was the previous in-string byte a backslash?
	cleaned := make([]byte, 0, len(raw))
	for _, b := range raw {
		if escaped {
			escaped = false
			// Check if this is a valid JSON escape character
			switch b {
			case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
				cleaned = append(cleaned, b)
			case 'u':
				// \uXXXX: pass through (hex digits validated by JSON decoder)
				cleaned = append(cleaned, b)
			default:
				// Invalid escape like \_ or \G: double the backslash so \G -> \\G
				cleaned = append(cleaned, '\\', b)
			}
			continue
		}
		if b == '\\' && inString {
			escaped = true
			cleaned = append(cleaned, b)
			continue
		}
		if b == '"' {
			// Note: an escaped quote is consumed by the `escaped` branch above,
			// so this toggle only fires on real string delimiters.
			inString = !inString
			cleaned = append(cleaned, b)
			continue
		}
		if inString && b < 0x20 {
			// Replace control characters with space (\n and \r are common, others rare)
			switch b {
			case '\n':
				cleaned = append(cleaned, '\\', 'n')
			case '\r':
				cleaned = append(cleaned, '\\', 'r')
			case '\t':
				cleaned = append(cleaned, '\\', 't')
			default:
				cleaned = append(cleaned, ' ')
			}
			continue
		}
		cleaned = append(cleaned, b)
	}
	return cleaned
}
// rescueBrokenJSON is the last-resort handler for telemetry payloads where the error
// field contains unescaped characters (typically " from awk/mawk gsub inconsistencies)
// that broke JSON parsing and couldn't be fixed by sanitizeRawJSON.
//
// Strategy: extract all simple/numeric fields via regex (they're always clean),
// then find the error field boundaries and properly re-encode its content.
//
// Returns an error only when the minimum required fields (random_id, nsapp,
// status) cannot be recovered; a missing/empty error field is not fatal.
func rescueBrokenJSON(raw []byte) (TelemetryIn, error) {
	var in TelemetryIn
	s := string(raw)
	// Helper: extract a simple JSON string value. Deliberately stops at the
	// first unescaped quote, so values containing escaped quotes would be
	// truncated — acceptable for these fields, which never contain quotes.
	strField := func(name string) string {
		re := regexp.MustCompile(`"` + regexp.QuoteMeta(name) + `"\s*:\s*"([^"]*)"`)
		m := re.FindStringSubmatch(s)
		if len(m) >= 2 {
			return m[1]
		}
		return ""
	}
	// Helper: extract a JSON integer value. The Atoi error is ignored on
	// purpose: the regex already guarantees a digits-only match.
	intField := func(name string) int {
		re := regexp.MustCompile(`"` + regexp.QuoteMeta(name) + `"\s*:\s*(-?\d+)`)
		m := re.FindStringSubmatch(s)
		if len(m) >= 2 {
			n, _ := strconv.Atoi(m[1])
			return n
		}
		return 0
	}
	// Extract simple fields (these never contain problematic characters)
	in.RandomID = strField("random_id")
	in.ExecutionID = strField("execution_id")
	in.Type = strField("type")
	in.NSAPP = strField("nsapp")
	in.Status = strField("status")
	in.OsType = strField("os_type")
	in.OsVersion = strField("os_version")
	in.PveVer = strField("pve_version")
	in.Method = strField("method")
	in.ErrorCategory = strField("error_category")
	in.RepoSource = strField("repo_source")
	in.GPUVendor = strField("gpu_vendor")
	in.GPUModel = strField("gpu_model")
	in.GPUPassthrough = strField("gpu_passthrough")
	in.CPUVendor = strField("cpu_vendor")
	in.CPUModel = strField("cpu_model")
	in.RAMSpeed = strField("ram_speed")
	in.CTType = intField("ct_type")
	in.DiskSize = intField("disk_size")
	in.CoreCount = intField("core_count")
	in.RAMSize = intField("ram_size")
	in.ExitCode = intField("exit_code")
	in.InstallDuration = intField("install_duration")
	// Validate that we got the minimum required fields
	if in.RandomID == "" || in.NSAPP == "" || in.Status == "" {
		return in, fmt.Errorf("rescue failed: missing required fields (random_id=%q, nsapp=%q, status=%q)", in.RandomID, in.NSAPP, in.Status)
	}
	// Extract the error field: the one with broken escaping.
	// Find "error": " and then locate the boundary by searching for the next
	// known field pattern: ",\n followed by "error_category" or similar.
	errorRe := regexp.MustCompile(`"error"\s*:\s*"`)
	eloc := errorRe.FindStringIndex(s)
	if eloc != nil {
		valueStart := eloc[1]
		// Find the boundary: the next known field after "error"
		// Pattern in the JSON: ...error content...",\n "error_category": "..."
		boundaryFields := []string{"error_category", "install_duration", "cpu_vendor", "gpu_vendor", "ram_speed", "repo_source"}
		endPos := -1 // offset relative to valueStart; keep the earliest match
		for _, field := range boundaryFields {
			re := regexp.MustCompile(`",?\s*\n\s*"` + regexp.QuoteMeta(field) + `"`)
			m := re.FindStringIndex(s[valueStart:])
			if m != nil && (endPos < 0 || m[0] < endPos) {
				endPos = m[0]
			}
		}
		// Also check for end-of-object after error (in minimal payloads)
		if endPos < 0 {
			re := regexp.MustCompile(`",?\s*\n?\s*}`)
			m := re.FindStringIndex(s[valueStart:])
			if m != nil {
				endPos = m[0]
			}
		}
		// endPos == 0 means an empty error value; in.Error stays "" either way.
		if endPos > 0 {
			rawError := s[valueStart : valueStart+endPos]
			// Truncate to 16KB to keep it manageable (the full log was likely 120KB)
			if len(rawError) > 16384 {
				rawError = rawError[:16384] + "... [truncated by server rescue]"
			}
			// BUG FIX: decode escapes in a single left-to-right pass instead of
			// chained strings.ReplaceAll calls. The old chain replaced \n and \t
			// BEFORE \\, so the sequence `\\n` (escaped backslash followed by a
			// literal 'n') was corrupted into backslash + newline.
			in.Error = unescapeBestEffort(rawError)
		}
	}
	return in, nil
}

// unescapeBestEffort decodes the common JSON escapes (\\ \" \n \t) in one
// left-to-right pass; unknown escapes (e.g. \G) are kept verbatim. Scanning
// left to right guarantees `\\n` decodes to backslash + 'n', never to a
// backslash followed by a newline.
func unescapeBestEffort(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c != '\\' || i+1 >= len(s) {
			b.WriteByte(c)
			continue
		}
		switch s[i+1] {
		case 'n':
			b.WriteByte('\n')
			i++
		case 't':
			b.WriteByte('\t')
			i++
		case '\\':
			b.WriteByte('\\')
			i++
		case '"':
			b.WriteByte('"')
			i++
		default:
			// Unknown escape: emit the backslash; the next iteration emits
			// the following character unchanged.
			b.WriteByte(c)
		}
	}
	return b.String()
}
// sanitizeIPs masks IPv4 addresses found in log text (GDPR anonymization).
// Only the first octet survives for debugging purposes — "192.168.1.5"
// becomes "192.x.x" via the package-level ipv4Re pattern.
func sanitizeIPs(s string) string {
	if len(s) == 0 {
		return s
	}
	return ipv4Re.ReplaceAllString(s, "${1}x.x")
}
func validate(in *TelemetryIn) error {
// Sanitize all string fields
in.RandomID = sanitizeShort(in.RandomID, 64)
in.ExecutionID = sanitizeShort(in.ExecutionID, 64)
in.Type = sanitizeShort(in.Type, 8)
in.NSAPP = sanitizeShort(in.NSAPP, 64)
in.Status = sanitizeShort(in.Status, 16)
in.OsType = sanitizeShort(in.OsType, 32)
in.OsVersion = sanitizeShort(in.OsVersion, 32)
in.PveVer = sanitizeShort(in.PveVer, 32)
in.Method = sanitizeShort(in.Method, 32)
// Sanitize extended fields
in.GPUVendor = strings.ToLower(sanitizeShort(in.GPUVendor, 16))
in.GPUModel = sanitizeShort(in.GPUModel, 64)
in.GPUPassthrough = strings.ToLower(sanitizeShort(in.GPUPassthrough, 16))
in.CPUVendor = strings.ToLower(sanitizeShort(in.CPUVendor, 16))
in.CPUModel = sanitizeShort(in.CPUModel, 64)
in.RAMSpeed = sanitizeShort(in.RAMSpeed, 16)
in.ErrorCategory = strings.ToLower(sanitizeShort(in.ErrorCategory, 32))
// Sanitize repo_source (routing field)
in.RepoSource = sanitizeShort(in.RepoSource, 64)
// Default empty values to "unknown" for consistency
if in.GPUVendor == "" {
in.GPUVendor = "unknown"
}
if in.GPUPassthrough == "" {
in.GPUPassthrough = "unknown"
}
if in.CPUVendor == "" {
in.CPUVendor = "unknown"
}
// Allow longer error text to capture full installation log + anonymize IPs (GDPR)
in.Error = sanitizeIPs(sanitizeMultiLine(in.Error, 131072))
// Required fields for all requests
if in.RandomID == "" || in.Type == "" || in.NSAPP == "" || in.Status == "" {
return errors.New("missing required fields: random_id, type, nsapp, status")
}
// Normalize common typos for backwards compatibility
if in.Status == "sucess" {
in.Status = "success"
}
// Validate enums with fallback to safe defaults (never reject writes)
if !allowedType[in.Type] {
log.Printf("[WARN] unknown type %q from nsapp=%s, rejecting", in.Type, in.NSAPP)
return errors.New("invalid type")
}
if !allowedStatus[in.Status] {
log.Printf("[WARN] unknown status %q from nsapp=%s, falling back to 'unknown'", in.Status, in.NSAPP)
in.Status = "unknown"
}