Skip to content

Commit b114bf3

Browse files
committed
update library design
1 parent c7121f6 commit b114bf3

33 files changed

Lines changed: 6823 additions & 790 deletions

py-src/data_formulator/agents/agent_data_load.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,21 @@
2727
2828
''' + generate_semantic_types_prompt() + '''
2929
30+
Enriched annotation fields (optional — provide when applicable):
31+
32+
- "intrinsic_domain": [min, max] — the known scale bounds of the measurement instrument.
33+
- Infer from data values and context: e.g., if a "rating" column has values 1-10, the domain is [1, 10]; if it's clearly a 5-star system, use [1, 5].
34+
- For Percentage: [0, 100] if values are whole-number percentages, [0, 1] if fractional.
35+
- For Correlation: always [-1, 1].
36+
- Do NOT provide for open-ended measures like Revenue, Count, Quantity, Temperature, etc.
37+
- Only provide when the scale bounds are clear from the data or domain knowledge.
38+
- "unit": a short unit string for physical/monetary quantities.
39+
- Temperature: "°C", "°F", "K"
40+
- Physical: "kg", "km", "mph", "m²", "L", etc.
41+
- Currency: "USD", "EUR", "¥", etc.
42+
- Duration: "ms", "s", "min", "hr"
43+
    - Only provide when the unit is clear from the column name, data values, or context.
44+
3045
Sort order:
3146
3247
- if the field is string type and is ordinal, provide the natural sort order of the field values here.
@@ -42,8 +57,12 @@
4257
{
4358
"suggested_table_name": ..., // the name of the table
4459
"fields": {
45-
"field1": {"type": ..., "semantic_type": ..., "sort_order": [...]}, // replace field1 field2 with actual field names, if the field is string type and is ordinal, provide the natural sort order of the fields here
46-
"field2": {"type": ..., "semantic_type": ...}, // no need to provide sort_order if there is no inherent order of the field values
60+
"field1": {"type": ..., "semantic_type": ..., "sort_order": [...], "intrinsic_domain": [...], "unit": ...},
61+
// replace field1 field2 with actual field names
62+
// only include sort_order if the field is ordinal with inherent order
63+
// only include intrinsic_domain if the field has a known bounded scale
64+
// only include unit if the unit is clear from context
65+
"field2": {"type": ..., "semantic_type": ...},
4766
...
4867
},
4968
"data_summary": ... // a short summary of the data (50-100 words), should capture the key characteristics of the data
@@ -82,11 +101,11 @@
82101
{
83102
"suggested_table_name": "income",
84103
"fields": {
85-
"name": {"type": "string", "semantic_type": "State", "sort_order": null},
104+
"name": {"type": "string", "semantic_type": "State"},
86105
"region": {"type": "string", "semantic_type": "Region", "sort_order": ["northeast", "midwest", "south", "west", "other"]},
87-
"state_id": {"type": "number", "semantic_type": "ID", "sort_order": null},
88-
"pct": {"type": "number", "semantic_type": "Percentage", "sort_order": null},
89-
"total": {"type": "number", "semantic_type": "Count", "sort_order": null},
106+
"state_id": {"type": "number", "semantic_type": "ID"},
107+
"pct": {"type": "number", "semantic_type": "Percentage", "intrinsic_domain": [0, 1]},
108+
"total": {"type": "number", "semantic_type": "Count"},
90109
"group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
91110
},
92111
"data_summary": "Income distribution across US states, with percentage and count by income bracket."
@@ -121,18 +140,16 @@
121140
"fields": {
122141
"Date": {
123142
"type": "string",
124-
"semantic_type": "Date",
125-
"sort_order": null
143+
"semantic_type": "Date"
126144
},
127145
"City": {
128146
"type": "string",
129-
"semantic_type": "City",
130-
"sort_order": null
147+
"semantic_type": "City"
131148
},
132149
"Temperature": {
133150
"type": "number",
134151
"semantic_type": "Temperature",
135-
"sort_order": null
152+
"unit": "°F"
136153
}
137154
},
138155
"data_summary": "Daily temperature data comparing Seattle and Atlanta throughout 2020, recording daily temperature measurements for each city from January to September."

py-src/data_formulator/agents/agent_data_rec.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,38 @@
6262
"encodings": {} // object, map visual channels to output field names. Available channels depend on chart_type (see reference below).
6363
"config": {} // object (optional), chart styling options. Available options depend on chart_type (see reference below). Only include when there's a clear reason.
6464
}
65+
"field_metadata": { // object, semantic type for each field used in chart encodings.
66+
"<field_name>": "SemanticType" // string, one of the types from [SEMANTIC TYPE REFERENCE] below.
67+
}
6568
"output_variable": "" // string, descriptive snake_case Python variable name for the final DataFrame.
6669
}
6770
```
6871
72+
**[SEMANTIC TYPE REFERENCE]**
73+
74+
Choose the most specific type that fits. Only annotate fields used in chart encodings.
75+
76+
| Category | Types |
77+
|---|---|
78+
| Temporal | DateTime, Date, Time, Timestamp, Year, Quarter, Month, Week, Day, Hour, YearMonth, YearQuarter, YearWeek, Decade, Duration |
79+
| Monetary measures | Amount, Price, Revenue, Cost |
80+
| Physical measures | Quantity, Temperature |
81+
| Proportion | Percentage |
82+
| Signed/diverging | Profit, PercentageChange, Sentiment, Correlation |
83+
| Generic measures | Count, Number |
84+
| Discrete numeric | Rank, Score, Rating, Index |
85+
| Identifier | ID |
86+
| Geographic | Latitude, Longitude, Country, State, City, Region, Address, ZipCode |
87+
| Entity names | PersonName, Company, Product, Category, Name |
88+
| Coded categorical | Status, Type, Boolean, Direction |
89+
| Binned ranges | Range, AgeGroup |
90+
| Fallback | String, Unknown |
91+
92+
Key guidelines:
93+
- Use **Revenue/Cost** for summed monetary totals, **Price** for per-unit prices, **Profit** for values that can be negative.
94+
- Use **Temperature** (not Quantity) for temperature — it has special diverging behavior.
95+
- Use **Year** (not Number) for columns like "year" with values 2020, 2021.
96+
6997
**[CHART TYPE REFERENCE]**
7098
7199
Each chart type specifies: encodings (visual channels → field types), when to use it, data expectations, and optional config.
@@ -312,6 +340,10 @@
312340
"chart_type": "bar",
313341
"encodings": {"x": "student", "y": "average_score"}
314342
},
343+
"field_metadata": {
344+
"student": "ID",
345+
"average_score": "Score"
346+
},
315347
"output_variable": "student_rankings"
316348
}
317349
```

py-src/data_formulator/agents/agent_data_transform.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,13 +103,41 @@
103103
"encodings": {}, // object, map visual channels to output field names. Available channels depend on chart_type (see reference below).
104104
"config": {} // object (optional), chart styling options. Available options depend on chart_type (see reference below). Only include when there's a clear reason.
105105
},
106+
"field_metadata": { // object, semantic type for each field used in chart encodings.
107+
"<field_name>": "SemanticType" // string, one of the types from [SEMANTIC TYPE REFERENCE] below.
108+
},
106109
"output_variable": "...", // string, the name of the Python variable containing the final result.
107110
// Should be descriptive and informative (e.g., "sales_by_region", "monthly_revenue", "top_10_products"),
108111
// not generic names like "result_df" or "output". Use snake_case.
109112
"reason": "..." // string, explain why this refinement is made
110113
}
111114
```
112115
116+
**[SEMANTIC TYPE REFERENCE]**
117+
118+
Choose the most specific type for each encoding field.
119+
120+
| Category | Types |
121+
|---|---|
122+
| Temporal | DateTime, Date, Time, Timestamp, Year, Quarter, Month, Week, Day, Hour, YearMonth, YearQuarter, YearWeek, Decade, Duration |
123+
| Monetary measures | Amount, Price, Revenue, Cost |
124+
| Physical measures | Quantity, Temperature |
125+
| Proportion | Percentage |
126+
| Signed/diverging | Profit, PercentageChange, Sentiment, Correlation |
127+
| Generic measures | Count, Number |
128+
| Discrete numeric | Rank, Score, Rating, Index |
129+
| Identifier | ID |
130+
| Geographic | Latitude, Longitude, Country, State, City, Region, Address, ZipCode |
131+
| Entity names | PersonName, Company, Product, Category, Name |
132+
| Coded categorical | Status, Type, Boolean, Direction |
133+
| Binned ranges | Range, AgeGroup |
134+
| Fallback | String, Unknown |
135+
136+
Key guidelines:
137+
- Use **Revenue/Cost** for summed totals, **Price** for per-unit, **Profit** for signed.
138+
- Use **Temperature** for temperature (diverging behavior).
139+
- Use **Year** not Number for year columns.
140+
113141
**[CHART TYPE REFERENCE]**
114142
115143
Each chart type specifies: encodings (visual channels → field types), when to use it, data expectations, and optional config.
@@ -251,6 +279,11 @@
251279
"chart_type": "point",
252280
"encodings": {"x": "Seattle Temperature", "y": "Atlanta Temperature", "color": "Warmer City"}
253281
},
282+
"field_metadata": {
283+
"Seattle Temperature": "Temperature",
284+
"Atlanta Temperature": "Temperature",
285+
"Warmer City": "Category"
286+
},
254287
"output_variable": "city_temp_comparison",
255288
"reason": "To compare Seattle and Atlanta temperatures, we need to pivot the data to have separate temperature columns for each city, then compute which city is warmer."
256289
}

py-src/data_formulator/agents/data_agent.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
coerce_field_type,
3131
resolve_field_type,
3232
spec_to_base64,
33+
field_metadata_to_semantic_types,
3334
)
3435

3536
import pandas as pd
@@ -510,7 +511,10 @@ def _create_chart(
510511
field_type = coerce_field_type(chart_type, channel, field_type)
511512
encodings[channel] = {"field": field, "type": field_type}
512513

513-
spec = assemble_vegailte_chart(df, chart_type, encodings, config=chart_config)
514+
spec = assemble_vegailte_chart(
515+
df, chart_type, encodings, config=chart_config,
516+
semantic_types=field_metadata_to_semantic_types(refined_goal.get("field_metadata")),
517+
)
514518
return spec_to_base64(spec) if spec else None
515519
except Exception as e:
516520
logger.error(f"[DataAgent] Chart creation error: {e}")

0 commit comments

Comments
 (0)