diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go index 80f7eba..1b63d92 100644 --- a/internal/cli/dataset.go +++ b/internal/cli/dataset.go @@ -483,7 +483,7 @@ contributors train against it without ever seeing the raw files.`)) } a.Spec.Schema = sch } else { - sch, skipped, ierr := push.InferSchema(layout.LabelsCSV) + sch, skipped, empty, ierr := push.InferSchema(layout.LabelsCSV) if ierr != nil { return &exitError{code: 3, err: fmt.Errorf("inferring schema from CSV: %w", ierr)} } @@ -495,6 +495,11 @@ contributors train against it without ever seeing the raw files.`)) _, _ = fmt.Fprintf(out, " (skipped framework-managed column(s): %s)\n", strings.Join(skipped, ", ")) } + if len(empty) > 0 { + _, _ = fmt.Fprintf(out, + " (warning: %d column(s) had no values in the sample and were typed FLOAT (nullable): %s)\n", + len(empty), strings.Join(empty, ", ")) + } } case push.IsImage(a.Spec.Category): // keypoint_detection needs --number-of-keypoints (dataset- diff --git a/internal/push/tabular.go b/internal/push/tabular.go index 8b10e74..d8aa87d 100644 --- a/internal/push/tabular.go +++ b/internal/push/tabular.go @@ -145,8 +145,10 @@ func ParseSchema(s string) (map[string]string, error) { // InferSchema reads the CSV header and a sample of rows and infers a // column→SQL-type map: all-integer columns → INT, otherwise // all-numeric → FLOAT, otherwise VARCHAR(255). Empty cells are -// ignored when judging a column; a column with no non-empty sampled -// value falls back to VARCHAR(255). +// ignored when judging a column; a column with NO non-empty sampled +// value is typed as a nullable FLOAT (not VARCHAR — an all-NULL VARCHAR +// is exactly what the ingestor's string validator rejects) and returned +// in `empty` so the caller can warn. // // It's a convenience so customers don't hand-write a --schema for the // common case. Non-numeric specials (timestamps, dates, booleans) @@ -155,20 +157,20 @@ func ParseSchema(s string) (map[string]string, error) { // Framework-managed columns (see reservedColumns — id, data_id, …) // are skipped and returned as the second value so the caller can tell // the customer they weren't included. -func InferSchema(csvPath string) (schema map[string]string, skipped []string, err error) { +func InferSchema(csvPath string) (schema map[string]string, skipped, empty []string, err error) { f, err := os.Open(csvPath) if err != nil { - return nil, nil, err + return nil, nil, nil, err } defer func() { _ = f.Close() }() r := csv.NewReader(f) header, err := r.Read() if err != nil { - return nil, nil, fmt.Errorf("reading CSV header from %s: %w", csvPath, err) + return nil, nil, nil, fmt.Errorf("reading CSV header from %s: %w", csvPath, err) } if len(header) == 0 { - return nil, nil, fmt.Errorf("CSV %s has no columns", csvPath) + return nil, nil, nil, fmt.Errorf("CSV %s has no columns", csvPath) } // Per-column running judgement. @@ -186,7 +188,7 @@ func InferSchema(csvPath string) (schema map[string]string, skipped []string, er break } if err != nil { - return nil, nil, fmt.Errorf("reading CSV row from %s: %w", csvPath, err) + return nil, nil, nil, fmt.Errorf("reading CSV row from %s: %w", csvPath, err) } for i := 0; i < len(header) && i < len(row); i++ { v := strings.TrimSpace(row[i]) @@ -221,9 +223,19 @@ func InferSchema(csvPath string) (schema map[string]string, skipped []string, er schema[col] = "INT" case sawValue[i] && couldBeFloat[i]: schema[col] = "FLOAT" + case !sawValue[i]: + // Entirely empty in the sample (e.g. an unmeasured analyte in a + // sparse panel). It can't be typed from data; default to a + // nullable FLOAT rather than VARCHAR — a tabular feature column + // is numeric far more often than text, and an all-NULL VARCHAR + // is exactly the shape the ingestor's string validator rejects. + // Reported in `empty` so the caller can warn / the user can + // --schema-override. + schema[col] = "FLOAT" + empty = append(empty, col) default: schema[col] = "VARCHAR(255)" } } - return schema, skipped, nil + return schema, skipped, empty, nil } diff --git a/internal/push/tabular_test.go b/internal/push/tabular_test.go index 322101c..28c6d02 100644 --- a/internal/push/tabular_test.go +++ b/internal/push/tabular_test.go @@ -66,7 +66,7 @@ func TestInferSchema(t *testing.T) { csv := writeFile(t, dir, "data.csv", "count,age,price,name\n1,30,9.99,alice\n2,40,19.5,bob\n") - schema, _, err := InferSchema(csv) + schema, _, _, err := InferSchema(csv) if err != nil { t.Fatalf("InferSchema: %v", err) } @@ -83,22 +83,26 @@ func TestInferSchema(t *testing.T) { } } -// TestInferSchema_EmptyColumnIsVarchar: a column with no non-empty -// sampled value can't be typed, so it falls back to VARCHAR(255) -// rather than being mislabeled INT/FLOAT. -func TestInferSchema_EmptyColumnIsVarchar(t *testing.T) { +// TestInferSchema_EmptyColumnIsFloat: a column with no non-empty sampled +// value can't be typed from data; it's returned as a nullable FLOAT (not +// VARCHAR — an all-NULL VARCHAR is what the ingestor's string validator +// rejects) and reported in the `empty` return so the caller can warn. +func TestInferSchema_EmptyColumnIsFloat(t *testing.T) { dir := t.TempDir() csv := writeFile(t, dir, "data.csv", "filled,empty\n1,\n2,\n") - schema, _, err := InferSchema(csv) + schema, _, empty, err := InferSchema(csv) if err != nil { t.Fatalf("InferSchema: %v", err) } - if schema["empty"] != "VARCHAR(255)" { - t.Errorf("schema[empty] = %q, want VARCHAR(255)", schema["empty"]) + if schema["empty"] != "FLOAT" { + t.Errorf("schema[empty] = %q, want FLOAT", schema["empty"]) } if schema["filled"] != "INT" { t.Errorf("schema[filled] = %q, want INT", schema["filled"]) } + if len(empty) != 1 || empty[0] != "empty" { + t.Errorf("empty = %v, want [empty]", empty) + } } // TestInferSchema_SkipsReservedColumns: a CSV with an `id` (or other @@ -109,7 +113,7 @@ func TestInferSchema_SkipsReservedColumns(t *testing.T) { dir := t.TempDir() csv := writeFile(t, dir, "data.csv", "id,feature_00,label\n1,1.5,0\n2,2.5,1\n") - schema, skipped, err := InferSchema(csv) + schema, skipped, _, err := InferSchema(csv) if err != nil { t.Fatalf("InferSchema: %v", err) }