Skip to content

Commit b114bf3

Browse files
committed
update library design
1 parent c7121f6 commit b114bf3

33 files changed

Lines changed: 6823 additions & 790 deletions

py-src/data_formulator/agents/agent_data_load.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@
2727
2828
''' + generate_semantic_types_prompt() + '''
2929
30+
Enriched annotation fields (optional — provide when applicable):
31+
32+
- "intrinsic_domain": [min, max] — the known scale bounds of the measurement instrument.
33+
- Infer from data values and context: e.g., if a "rating" column has values 1-10, the domain is [1, 10]; if it's clearly a 5-star system, use [1, 5].
34+
- For Percentage: [0, 100] if values are whole-number percentages, [0, 1] if fractional.
35+
- For Correlation: always [-1, 1].
36+
- Do NOT provide for open-ended measures like Revenue, Count, Quantity, Temperature, etc.
37+
- Only provide when the scale bounds are clear from the data or domain knowledge.
38+
- "unit": a short unit string for physical/monetary quantities.
39+
- Temperature: "°C", "°F", "K"
40+
- Physical: "kg", "km", "mph", "m²", "L", etc.
41+
- Currency: "USD", "EUR", "¥", etc.
42+
- Duration: "ms", "s", "min", "hr"
43+
    - Only provide when the unit is clear from the column name, data values, or context.
44+
3045
Sort order:
3146
3247
- if the field is string type and is ordinal, provide the natural sort order of the field values here.
@@ -42,8 +57,12 @@
4257
{
4358
"suggested_table_name": ..., // the name of the table
4459
"fields": {
45-
"field1": {"type": ..., "semantic_type": ..., "sort_order": [...]}, // replace field1 field2 with actual field names, if the field is string type and is ordinal, provide the natural sort order of the fields here
46-
"field2": {"type": ..., "semantic_type": ...}, // no need to provide sort_order if there is no inherent order of the field values
60+
"field1": {"type": ..., "semantic_type": ..., "sort_order": [...], "intrinsic_domain": [...], "unit": ...},
61+
// replace field1 field2 with actual field names
62+
// only include sort_order if the field is ordinal with inherent order
63+
// only include intrinsic_domain if the field has a known bounded scale
64+
// only include unit if the unit is clear from context
65+
"field2": {"type": ..., "semantic_type": ...},
4766
...
4867
},
4968
"data_summary": ... // a short summary of the data (50-100 words), should capture the key characteristics of the data
@@ -82,11 +101,11 @@
82101
{
83102
"suggested_table_name": "income",
84103
"fields": {
85-
"name": {"type": "string", "semantic_type": "State", "sort_order": null},
104+
"name": {"type": "string", "semantic_type": "State"},
86105
"region": {"type": "string", "semantic_type": "Region", "sort_order": ["northeast", "midwest", "south", "west", "other"]},
87-
"state_id": {"type": "number", "semantic_type": "ID", "sort_order": null},
88-
"pct": {"type": "number", "semantic_type": "Percentage", "sort_order": null},
89-
"total": {"type": "number", "semantic_type": "Count", "sort_order": null},
106+
"state_id": {"type": "number", "semantic_type": "ID"},
107+
"pct": {"type": "number", "semantic_type": "Percentage", "intrinsic_domain": [0, 1]},
108+
"total": {"type": "number", "semantic_type": "Count"},
90109
"group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
91110
},
92111
"data_summary": "Income distribution across US states, with percentage and count by income bracket."
@@ -121,18 +140,16 @@
121140
"fields": {
122141
"Date": {
123142
"type": "string",
124-
"semantic_type": "Date",
125-
"sort_order": null
143+
"semantic_type": "Date"
126144
},
127145
"City": {
128146
"type": "string",
129-
"semantic_type": "City",
130-
"sort_order": null
147+
"semantic_type": "City"
131148
},
132149
"Temperature": {
133150
"type": "number",
134151
"semantic_type": "Temperature",
135-
"sort_order": null
152+
"unit": "°F"
136153
}
137154
},
138155
"data_summary": "Daily temperature data comparing Seattle and Atlanta throughout 2020, recording daily temperature measurements for each city from January to September."

py-src/data_formulator/agents/agent_data_rec.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,38 @@
6262
"encodings": {} // object, map visual channels to output field names. Available channels depend on chart_type (see reference below).
6363
"config": {} // object (optional), chart styling options. Available options depend on chart_type (see reference below). Only include when there's a clear reason.
6464
}
65+
"field_metadata": { // object, semantic type for each field used in chart encodings.
66+
"<field_name>": "SemanticType" // string, one of the types from [SEMANTIC TYPE REFERENCE] below.
67+
}
6568
"output_variable": "" // string, descriptive snake_case Python variable name for the final DataFrame.
6669
}
6770
```
6871
72+
**[SEMANTIC TYPE REFERENCE]**
73+
74+
Choose the most specific type that fits. Only annotate fields used in chart encodings.
75+
76+
| Category | Types |
77+
|---|---|
78+
| Temporal | DateTime, Date, Time, Timestamp, Year, Quarter, Month, Week, Day, Hour, YearMonth, YearQuarter, YearWeek, Decade, Duration |
79+
| Monetary measures | Amount, Price, Revenue, Cost |
80+
| Physical measures | Quantity, Temperature |
81+
| Proportion | Percentage |
82+
| Signed/diverging | Profit, PercentageChange, Sentiment, Correlation |
83+
| Generic measures | Count, Number |
84+
| Discrete numeric | Rank, Score, Rating, Index |
85+
| Identifier | ID |
86+
| Geographic | Latitude, Longitude, Country, State, City, Region, Address, ZipCode |
87+
| Entity names | PersonName, Company, Product, Category, Name |
88+
| Coded categorical | Status, Type, Boolean, Direction |
89+
| Binned ranges | Range, AgeGroup |
90+
| Fallback | String, Unknown |
91+
92+
Key guidelines:
93+
- Use **Revenue/Cost** for summed monetary totals, **Price** for per-unit prices, **Profit** for values that can be negative.
94+
- Use **Temperature** (not Quantity) for temperature — it has special diverging behavior.
95+
- Use **Year** (not Number) for columns like "year" with values 2020, 2021.
96+
6997
**[CHART TYPE REFERENCE]**
7098
7199
Each chart type specifies: encodings (visual channels → field types), when to use it, data expectations, and optional config.
@@ -312,6 +340,10 @@
312340
"chart_type": "bar",
313341
"encodings": {"x": "student", "y": "average_score"}
314342
},
343+
"field_metadata": {
344+
"student": "ID",
345+
"average_score": "Score"
346+
},
315347
"output_variable": "student_rankings"
316348
}
317349
```

py-src/data_formulator/agents/agent_data_transform.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,41 @@
103103
"encodings": {}, // object, map visual channels to output field names. Available channels depend on chart_type (see reference below).
104104
"config": {} // object (optional), chart styling options. Available options depend on chart_type (see reference below). Only include when there's a clear reason.
105105
},
106+
"field_metadata": { // object, semantic type for each field used in chart encodings.
107+
"<field_name>": "SemanticType" // string, one of the types from [SEMANTIC TYPE REFERENCE] below.
108+
},
106109
"output_variable": "...", // string, the name of the Python variable containing the final result.
107110
// Should be descriptive and informative (e.g., "sales_by_region", "monthly_revenue", "top_10_products"),
108111
// not generic names like "result_df" or "output". Use snake_case.
109112
"reason": "..." // string, explain why this refinement is made
110113
}
111114
```
112115
116+
**[SEMANTIC TYPE REFERENCE]**
117+
118+
Choose the most specific type for each encoding field.
119+
120+
| Category | Types |
121+
|---|---|
122+
| Temporal | DateTime, Date, Time, Timestamp, Year, Quarter, Month, Week, Day, Hour, YearMonth, YearQuarter, YearWeek, Decade, Duration |
123+
| Monetary measures | Amount, Price, Revenue, Cost |
124+
| Physical measures | Quantity, Temperature |
125+
| Proportion | Percentage |
126+
| Signed/diverging | Profit, PercentageChange, Sentiment, Correlation |
127+
| Generic measures | Count, Number |
128+
| Discrete numeric | Rank, Score, Rating, Index |
129+
| Identifier | ID |
130+
| Geographic | Latitude, Longitude, Country, State, City, Region, Address, ZipCode |
131+
| Entity names | PersonName, Company, Product, Category, Name |
132+
| Coded categorical | Status, Type, Boolean, Direction |
133+
| Binned ranges | Range, AgeGroup |
134+
| Fallback | String, Unknown |
135+
136+
Key guidelines:
137+
- Use **Revenue/Cost** for summed totals, **Price** for per-unit, **Profit** for signed.
138+
- Use **Temperature** for temperature (diverging behavior).
139+
- Use **Year** not Number for year columns.
140+
113141
**[CHART TYPE REFERENCE]**
114142
115143
Each chart type specifies: encodings (visual channels → field types), when to use it, data expectations, and optional config.
@@ -251,6 +279,11 @@
251279
"chart_type": "point",
252280
"encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"}
253281
},
282+
"field_metadata": {
283+
"Seattle Temperature": "Temperature",
284+
"Atlanta Temperature": "Temperature",
285+
"Warmer City": "Category"
286+
},
254287
"output_variable": "city_temp_comparison",
255288
"reason": "To compare Seattle and Atlanta temperatures, we need to pivot the data to have separate temperature columns for each city, then compute which city is warmer."
256289
}

py-src/data_formulator/agents/data_agent.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
coerce_field_type,
3131
resolve_field_type,
3232
spec_to_base64,
33+
field_metadata_to_semantic_types,
3334
)
3435

3536
import pandas as pd
@@ -510,7 +511,10 @@ def _create_chart(
510511
field_type = coerce_field_type(chart_type, channel, field_type)
511512
encodings[channel] = {"field": field, "type": field_type}
512513

513-
spec = assemble_vegailte_chart(df, chart_type, encodings, config=chart_config)
514+
spec = assemble_vegailte_chart(
515+
df, chart_type, encodings, config=chart_config,
516+
semantic_types=field_metadata_to_semantic_types(refined_goal.get("field_metadata")),
517+
)
514518
return spec_to_base64(spec) if spec else None
515519
except Exception as e:
516520
logger.error(f"[DataAgent] Chart creation error: {e}")

0 commit comments

Comments
 (0)