Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion internal/cli/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,7 @@ contributors train against it without ever seeing the raw files.`))
}
a.Spec.Schema = sch
} else {
sch, skipped, ierr := push.InferSchema(layout.LabelsCSV)
sch, skipped, empty, ierr := push.InferSchema(layout.LabelsCSV)
if ierr != nil {
return &exitError{code: 3, err: fmt.Errorf("inferring schema from CSV: %w", ierr)}
}
Expand All @@ -495,6 +495,11 @@ contributors train against it without ever seeing the raw files.`))
_, _ = fmt.Fprintf(out,
" (skipped framework-managed column(s): %s)\n", strings.Join(skipped, ", "))
}
if len(empty) > 0 {
_, _ = fmt.Fprintf(out,
" (warning: %d column(s) had no values in the sample and were typed FLOAT (nullable): %s)\n",
len(empty), strings.Join(empty, ", "))
}
}
case push.IsImage(a.Spec.Category):
// keypoint_detection needs --number-of-keypoints (dataset-
Expand Down
28 changes: 20 additions & 8 deletions internal/push/tabular.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,10 @@ func ParseSchema(s string) (map[string]string, error) {
// InferSchema reads the CSV header and a sample of rows and infers a
// column→SQL-type map: all-integer columns → INT, otherwise
// all-numeric → FLOAT, otherwise VARCHAR(255). Empty cells are
// ignored when judging a column; a column with no non-empty sampled
// value falls back to VARCHAR(255).
// ignored when judging a column; a column with NO non-empty sampled
// value is typed as a nullable FLOAT (not VARCHAR — an all-NULL VARCHAR
// is exactly what the ingestor's string validator rejects) and returned
// in `empty` so the caller can warn.
//
// It's a convenience so customers don't hand-write a --schema for the
// common case. Non-numeric specials (timestamps, dates, booleans)
Expand All @@ -155,20 +157,20 @@ func ParseSchema(s string) (map[string]string, error) {
// Framework-managed columns (see reservedColumns — id, data_id, …)
// are skipped and returned as the second value so the caller can tell
// the customer they weren't included.
func InferSchema(csvPath string) (schema map[string]string, skipped []string, err error) {
func InferSchema(csvPath string) (schema map[string]string, skipped, empty []string, err error) {
f, err := os.Open(csvPath)
if err != nil {
return nil, nil, err
return nil, nil, nil, err
}
defer func() { _ = f.Close() }()

r := csv.NewReader(f)
header, err := r.Read()
if err != nil {
return nil, nil, fmt.Errorf("reading CSV header from %s: %w", csvPath, err)
return nil, nil, nil, fmt.Errorf("reading CSV header from %s: %w", csvPath, err)
}
if len(header) == 0 {
return nil, nil, fmt.Errorf("CSV %s has no columns", csvPath)
return nil, nil, nil, fmt.Errorf("CSV %s has no columns", csvPath)
}

// Per-column running judgement.
Expand All @@ -186,7 +188,7 @@ func InferSchema(csvPath string) (schema map[string]string, skipped []string, er
break
}
if err != nil {
return nil, nil, fmt.Errorf("reading CSV row from %s: %w", csvPath, err)
return nil, nil, nil, fmt.Errorf("reading CSV row from %s: %w", csvPath, err)
}
for i := 0; i < len(header) && i < len(row); i++ {
v := strings.TrimSpace(row[i])
Expand Down Expand Up @@ -221,9 +223,19 @@ func InferSchema(csvPath string) (schema map[string]string, skipped []string, er
schema[col] = "INT"
case sawValue[i] && couldBeFloat[i]:
schema[col] = "FLOAT"
case !sawValue[i]:
// Entirely empty in the sample (e.g. an unmeasured analyte in a
// sparse panel). It can't be typed from data; default to a
// nullable FLOAT rather than VARCHAR — a tabular feature column
// is numeric far more often than text, and an all-NULL VARCHAR
// is exactly the shape the ingestor's string validator rejects.
// Reported in `empty` so the caller can warn / the user can
// --schema-override.
schema[col] = "FLOAT"
empty = append(empty, col)
default:
schema[col] = "VARCHAR(255)"
}
}
return schema, skipped, nil
return schema, skipped, empty, nil
}
22 changes: 13 additions & 9 deletions internal/push/tabular_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ func TestInferSchema(t *testing.T) {
csv := writeFile(t, dir, "data.csv",
"count,age,price,name\n1,30,9.99,alice\n2,40,19.5,bob\n")

schema, _, err := InferSchema(csv)
schema, _, _, err := InferSchema(csv)
if err != nil {
t.Fatalf("InferSchema: %v", err)
}
Expand All @@ -83,22 +83,26 @@ func TestInferSchema(t *testing.T) {
}
}

// TestInferSchema_EmptyColumnIsVarchar: a column with no non-empty
// sampled value can't be typed, so it falls back to VARCHAR(255)
// rather than being mislabeled INT/FLOAT.
func TestInferSchema_EmptyColumnIsVarchar(t *testing.T) {
// TestInferSchema_EmptyColumnIsFloat: a column with no non-empty sampled
// value can't be typed from data; it's returned as a nullable FLOAT (not
// VARCHAR — an all-NULL VARCHAR is what the ingestor's string validator
// rejects) and reported in the `empty` return so the caller can warn.
func TestInferSchema_EmptyColumnIsFloat(t *testing.T) {
dir := t.TempDir()
csv := writeFile(t, dir, "data.csv", "filled,empty\n1,\n2,\n")
schema, _, err := InferSchema(csv)
schema, _, empty, err := InferSchema(csv)
if err != nil {
t.Fatalf("InferSchema: %v", err)
}
if schema["empty"] != "VARCHAR(255)" {
t.Errorf("schema[empty] = %q, want VARCHAR(255)", schema["empty"])
if schema["empty"] != "FLOAT" {
t.Errorf("schema[empty] = %q, want FLOAT", schema["empty"])
}
if schema["filled"] != "INT" {
t.Errorf("schema[filled] = %q, want INT", schema["filled"])
}
if len(empty) != 1 || empty[0] != "empty" {
t.Errorf("empty = %v, want [empty]", empty)
}
}

// TestInferSchema_SkipsReservedColumns: a CSV with an `id` (or other
Expand All @@ -109,7 +113,7 @@ func TestInferSchema_SkipsReservedColumns(t *testing.T) {
dir := t.TempDir()
csv := writeFile(t, dir, "data.csv", "id,feature_00,label\n1,1.5,0\n2,2.5,1\n")

schema, skipped, err := InferSchema(csv)
schema, skipped, _, err := InferSchema(csv)
if err != nil {
t.Fatalf("InferSchema: %v", err)
}
Expand Down
Loading