|
27 | 27 |
|
28 | 28 | ''' + generate_semantic_types_prompt() + ''' |
29 | 29 |
|
| 30 | +Enriched annotation fields (optional — provide when applicable): |
| 31 | +
|
| 32 | +- "intrinsic_domain": [min, max] — the known scale bounds of the measurement instrument. |
| 33 | + - Infer from data values and context: e.g., if a "rating" column has values 1-10, the domain is [1, 10]; if it's clearly a 5-star system, use [1, 5]. |
| 34 | + - For Percentage: [0, 100] if values are whole-number percentages, [0, 1] if fractional. |
| 35 | + - For Correlation: always [-1, 1]. |
| 36 | + - Do NOT provide for open-ended measures like Revenue, Count, Quantity, Temperature, etc. |
| 37 | + - Only provide when the scale bounds are clear from the data or domain knowledge. |
| 38 | +- "unit": a short unit string for physical/monetary quantities. |
| 39 | + - Temperature: "°C", "°F", "K" |
| 40 | + - Physical: "kg", "km", "mph", "m²", "L", etc. |
| 41 | + - Currency: "USD", "EUR", "JPY", etc. |
| 42 | + - Duration: "ms", "s", "min", "hr" |
| 43 | + - Only provide when the unit is clear from column name, data values, or context. |
| 44 | +
|
30 | 45 | Sort order: |
31 | 46 |
|
32 | 47 | - if the field is string type and is ordinal, provide the natural sort order of the field's values here. |
|
42 | 57 | { |
43 | 58 | "suggested_table_name": ..., // the name of the table |
44 | 59 | "fields": { |
45 | | - "field1": {"type": ..., "semantic_type": ..., "sort_order": [...]}, // replace field1 field2 with actual field names, if the field is string type and is ordinal, provide the natural sort order of the fields here |
46 | | - "field2": {"type": ..., "semantic_type": ...}, // no need to provide sort_order if there is no inherent order of the field values |
| 60 | + "field1": {"type": ..., "semantic_type": ..., "sort_order": [...], "intrinsic_domain": [...], "unit": ...}, |
| 61 | + // replace field1 and field2 with actual field names |
| 62 | + // only include sort_order if the field is ordinal with inherent order |
| 63 | + // only include intrinsic_domain if the field has a known bounded scale |
| 64 | + // only include unit if the unit is clear from context |
| 65 | + "field2": {"type": ..., "semantic_type": ...}, |
47 | 66 | ... |
48 | 67 | }, |
49 | 68 | "data_summary": ... // a short summary of the data (50-100 words), should capture the key characteristics of the data |
|
82 | 101 | { |
83 | 102 | "suggested_table_name": "income", |
84 | 103 | "fields": { |
85 | | - "name": {"type": "string", "semantic_type": "State", "sort_order": null}, |
| 104 | + "name": {"type": "string", "semantic_type": "State"}, |
86 | 105 | "region": {"type": "string", "semantic_type": "Region", "sort_order": ["northeast", "midwest", "south", "west", "other"]}, |
87 | | - "state_id": {"type": "number", "semantic_type": "ID", "sort_order": null}, |
88 | | - "pct": {"type": "number", "semantic_type": "Percentage", "sort_order": null}, |
89 | | - "total": {"type": "number", "semantic_type": "Count", "sort_order": null}, |
| 106 | + "state_id": {"type": "number", "semantic_type": "ID"}, |
| 107 | + "pct": {"type": "number", "semantic_type": "Percentage", "intrinsic_domain": [0, 1]}, |
| 108 | + "total": {"type": "number", "semantic_type": "Count"}, |
90 | 109 | "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]} |
91 | 110 | }, |
92 | 111 | "data_summary": "Income distribution across US states, with percentage and count by income bracket." |
|
121 | 140 | "fields": { |
122 | 141 | "Date": { |
123 | 142 | "type": "string", |
124 | | - "semantic_type": "Date", |
125 | | - "sort_order": null |
| 143 | + "semantic_type": "Date" |
126 | 144 | }, |
127 | 145 | "City": { |
128 | 146 | "type": "string", |
129 | | - "semantic_type": "City", |
130 | | - "sort_order": null |
| 147 | + "semantic_type": "City" |
131 | 148 | }, |
132 | 149 | "Temperature": { |
133 | 150 | "type": "number", |
134 | 151 | "semantic_type": "Temperature", |
135 | | - "sort_order": null |
| 152 | + "unit": "°F" |
136 | 153 | } |
137 | 154 | }, |
138 | 155 | "data_summary": "Daily temperature data comparing Seattle and Atlanta in 2020, recording daily temperature measurements for each city from January to September."
|
0 commit comments