
Commit eebe31a

[deploy] Merge pull request #226 from microsoft/dev

Live data now goes into data formulator!

2 parents: 7b6fd1a + 6c3abf3

61 files changed: 8775 additions & 3654 deletions


.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
     // "forwardPorts": [],

     // Use 'postCreateCommand' to run commands after the container is created.
-    "postCreateCommand": "python3 -m venv /workspaces/data-formulator/venv && . /workspaces/data-formulator/venv/bin/activate && pip install https://github.com/user-attachments/files/17319752/data_formulator-0.1.0.tar.gz --verbose && data_formulator"
+    "postCreateCommand": "cd /workspaces/data-formulator && npm install && npm run build && python3 -m venv /workspaces/data-formulator/venv && . /workspaces/data-formulator/venv/bin/activate && pip install -e /workspaces/data-formulator --verbose && data_formulator"

     // Configure tool-specific properties.
     // "customizations": {},

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,7 @@
 .DS_Store
 build/
 dist/
+experiment_data/

 ## Ignore Visual Studio temporary files, build results, and
 ## files generated by popular Visual Studio add-ons.
@@ -405,3 +406,7 @@ FodyWeavers.xsd
 # JetBrains Rider
 *.sln.iml
 venv
+
+
+\.\NUL
+NUL

README.md

Lines changed: 5 additions & 4 deletions
@@ -32,6 +32,10 @@ https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2


 ## News 🔥🔥🔥
+[01-25-2025] **Data Formulator 0.6** — Real-time insights from live data
+- **Connect to live data**: connect to URLs and databases with automatic refresh intervals; visualizations update automatically as your data changes to provide you with live insights. [Demo: track the International Space Station's position and speed live](https://github.com/microsoft/data-formulator/releases/tag/0.6)
+- 🎨 **UI Updates**: unified UI for data loading; drag and drop fields directly from the data table to update visualization designs.
+
 [12-08-2025] **Data Formulator 0.5.1** — Connect more, visualize more, move faster
 - 🔌 **Community data loaders**: Google BigQuery, MySQL, Postgres, MongoDB
 - 📊 **New chart types**: US Map & Pie Chart (more to be added soon)
@@ -41,10 +45,7 @@ https://github.com/user-attachments/assets/8ca57b68-4d7a-42cb-bcce-43f8b1681ce2
 [11-07-2025] Data Formulator 0.5: Vibe with your data, in control

 - 📊 **Load (almost) any data**: load structured data, extract data from screenshots, from messy text blocks, or connect to databases.
-- 🤖 **Explore data with AI agents**:
-    - In agent mode, provide a high-level goal and ask agents to explore data for you.
-    - To stay in control, directly interact with agents: ask for recommendations or specify chart designs with UI + NL inputs, and AI agents will formulate data to realize your design.
-    - Use data threads to control branching exploration paths: backtrack, branch, or follow up.
+- 🤖 **Explore data with AI agents**: Use agent mode for hands-off exploration, or stay in control in interactive mode.
 - **Verify AI generated results**: interact with charts and inspect data, formulas, explanations, and code.
 - 📝 **Create reports to share insights**: choose charts you want to share, and ask agents to create reports grounded in data formulated throughout exploration.

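To make the headline "live data" feature concrete, here is a minimal sketch of the polling pattern the 0.6 notes describe: re-fetch a URL on a fixed refresh interval and push each changed snapshot to the charting layer. The function names, the loop, and the endpoint are illustrative assumptions, not Data Formulator's actual implementation or API.

# Hypothetical sketch of "live data with automatic refresh"; not the actual
# Data Formulator implementation.
import json
import time
import urllib.request
from typing import Any, Callable

def fetch_snapshot(url: str) -> Any:
    # Download one snapshot of the live JSON data source.
    with urllib.request.urlopen(url) as resp:
        return json.loads(resp.read())

def poll(url: str, interval_seconds: float, on_update: Callable[[Any], None]) -> None:
    # Re-fetch every interval_seconds; refresh visualizations only when the
    # data actually changed.
    previous = None
    while True:
        snapshot = fetch_snapshot(url)
        if snapshot != previous:
            on_update(snapshot)  # e.g. hand the new rows to the chart renderer
            previous = snapshot
        time.sleep(interval_seconds)

# Example with a made-up endpoint:
# poll("https://example.com/iss_position.json", 5.0, print)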

package.json

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@
     "html2canvas": "^1.4.1",
     "katex": "^0.16.22",
     "localforage": "^1.10.0",
-    "lodash": "^4.17.21",
+    "lodash": "^4.17.23",
     "markdown-to-jsx": "^7.4.0",
     "mui-markdown": "^2.0.3",
     "prettier": "^2.8.3",
@@ -46,7 +46,7 @@
     "validator": "^13.15.20",
     "vega": "^6.2.0",
     "vega-embed": "^6.21.0",
-    "vega-lite": "^5.5.0",
+    "vega-lite": "6.4.1",
     "vm-browserify": "^1.1.2"
   },
   "scripts": {

public/df_stock_prices_live.json

Lines changed: 1 addition & 0 deletions
Large diff not rendered by default (9.46 KB).

py-src/data_formulator/agent_routes.py

Lines changed: 0 additions & 21 deletions
@@ -31,7 +31,6 @@
 from data_formulator.agents.agent_data_clean import DataCleanAgent
 from data_formulator.agents.agent_data_clean_stream import DataCleanAgentStream
 from data_formulator.agents.agent_code_explanation import CodeExplanationAgent
-from data_formulator.agents.agent_query_completion import QueryCompletionAgent
 from data_formulator.agents.agent_interactive_explore import InteractiveExploreAgent
 from data_formulator.agents.agent_report_gen import ReportGenAgent
 from data_formulator.agents.client_utils import Client
@@ -614,26 +613,6 @@ def request_code_expl():
     else:
         return jsonify({'error': 'Invalid request format'}), 400

-@agent_bp.route('/query-completion', methods=['POST'])
-def query_completion():
-    if request.is_json:
-        logger.info("# request data: ")
-        content = request.get_json()
-
-        client = get_client(content['model'])
-
-        data_source_metadata = content["data_source_metadata"]
-        query = content["query"]
-
-        query_completion_agent = QueryCompletionAgent(client=client)
-        reasoning, query = query_completion_agent.run(data_source_metadata, query)
-        response = flask.jsonify({ "token": "", "status": "ok", "reasoning": reasoning, "query": query })
-    else:
-        response = flask.jsonify({ "token": "", "status": "error", "reasoning": "unable to complete query", "query": "" })
-
-    response.headers.add('Access-Control-Allow-Origin', '*')
-    return response
-
 @agent_bp.route('/get-recommendation-questions', methods=['GET', 'POST'])
 def get_recommendation_questions():
     def generate():

py-src/data_formulator/agents/agent_exploration.py

Lines changed: 2 additions & 6 deletions
@@ -6,7 +6,7 @@
 import base64

 from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary
-from data_formulator.agents.agent_sql_data_transform import get_sql_table_statistics_str, sanitize_table_name
+from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary

 logger = logging.getLogger(__name__)

@@ -151,11 +151,7 @@ def get_chart_message(self, visualization):

     def get_data_summary(self, input_tables):
         if self.db_conn:
-            data_summary = ""
-            for table in input_tables:
-                table_name = sanitize_table_name(table['name'])
-                table_summary_str = get_sql_table_statistics_str(self.db_conn, table_name)
-                data_summary += f"[TABLE {table_name}]\n\n{table_summary_str}\n\n"
+            data_summary = generate_sql_data_summary(self.db_conn, input_tables)
         else:
             data_summary = generate_data_summary(input_tables)
         return data_summary
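The shared helper imported above is not shown in this commit excerpt. Below is a minimal sketch of what generate_sql_data_summary plausibly does, assuming it simply centralizes the per-table loop deleted here and in agent_interactive_explore.py; sanitize_table_name and get_sql_table_statistics_str are the module's existing helpers, but the body itself is an assumption, not the actual implementation.

# Sketch: assumed consolidation of the removed per-table loop into one helper
# in py-src/data_formulator/agents/agent_sql_data_transform.py (which already
# defines sanitize_table_name and get_sql_table_statistics_str).
def generate_sql_data_summary(db_conn, input_tables, table_name_prefix="Table"):
    parts = []
    for i, table in enumerate(input_tables, 1):
        table_name = sanitize_table_name(table['name'])            # existing helper
        stats = get_sql_table_statistics_str(db_conn, table_name)  # existing helper
        parts.append(f"[{table_name_prefix} {i}: {table_name}]\n\n{stats}")
    return "\n\n".join(parts)

The table_name_prefix parameter matches the keyword the refactored callers pass (e.g. "Thread Table" in the interactive-explore agent below); how the real helper formats each block may differ.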

py-src/data_formulator/agents/agent_interactive_explore.py

Lines changed: 39 additions & 34 deletions
@@ -6,31 +6,35 @@
 import pandas as pd

 from data_formulator.agents.agent_utils import extract_json_objects, generate_data_summary
-from data_formulator.agents.agent_sql_data_transform import get_sql_table_statistics_str, sanitize_table_name
+from data_formulator.agents.agent_sql_data_transform import generate_sql_data_summary

 logger = logging.getLogger(__name__)

 SYSTEM_PROMPT = '''You are a data exploration expert who suggests interesting questions to help users explore their datasets.

-Given a dataset (or a thread of datasets that have been explored), your task is to suggest 4 exploration questions (unless the user explicitly asks for the number of questions), that users can follow to gain insights from their data.
-* the user may provide you current explorations they have done, including:
-    - a thread of exploration questions they have explored
-    - the latest data sample they are viewing
-    - the current chart they are viewing
-* when the exploration context is provided, make your suggestion based on the context as well as the original dataset; otherwise leverage the original dataset to suggest questions.
+This prompt contains the following sections:
+- [DATASETS] section: available datasets the user is working with.
+- [EXPLORATION THREAD] section (optional): sequence of datasets that have been explored, in the order they were created, and the questions asked to create them. These tables are all created from tables in the [DATASETS] section.
+- [CURRENT DATA] section (optional): latest data sample the user is viewing, and the visualization they are currently looking at.
+- [START QUESTION] section (optional): start question from previous exploration steps, for context.
+
+Your task is to suggest 4 exploration questions (unless the user explicitly asks for a different number of questions) that users can follow to gain insights from their data.
+When the exploration context is provided, base your suggestions on the context as well as the original datasets; otherwise leverage the original datasets to suggest questions.

 Guidelines for question suggestions:
-1. Suggest interesting analytical questions that are not obvious that can uncover nontrivial insights
-2. Use a diverse language style to display the questions (can be questions, statements etc)
-3. If there are multiple datasets in a thread, consider relationships between them
+1. Suggest interesting analytical questions that can uncover new insights from the data.
+2. Use a diverse language style to display the questions (can be questions, statements, etc.).
+3. If there are multiple datasets in a thread, consider relationships between them.
 4. CONCISENESS: the questions should be concise and to the point
-5. QUESTION: the question should be a new question based on the thread of exploration:
-    - either a followup question, or a new question that is related to the thread
+5. QUESTION: the question should be a new question based on the exploration thread:
+    - if no exploration thread is provided, start with a high-level overview question that directly visualizes the data to give the user a sense of the data.
+    - either a followup question, or a new question that is related to the exploration thread
     - if the current data is rich, you can ask a followup question to further explore the dataset;
     - if the current data is already specialized to answer the previous question, you can ask a new question that is related to the thread but not related to the previous question in the thread, leverage earlier exploration data to ask questions that can expand the exploration horizon
     - do not repeat questions that have already been explored in the thread
    - do not suggest questions that are not related to the thread (e.g. questions that are completely unrelated to the exploration direction in the thread)
     - do not naively follow up if the question is already too low-level when previous iterations have already come into a small subset of the data (suggest new related areas related to the metric / attributes etc)
+    - leverage other datasets in the [DATASETS] section to suggest questions that are related to the exploration thread.
 6. DIVERSITY: the questions should be diverse in difficulty (easy / medium / hard) and the four questions should cover different aspects of the data analysis to expand the user's horizon
     - simple questions should be short -- single sentence exploratory questions
     - medium questions can be 1-2 sentences exploratory questions
@@ -59,15 +63,17 @@

 SYSTEM_PROMPT_AGENT = '''You are a data exploration expert to help users explore their datasets.

+This prompt contains the following sections:
+- [DATASETS] section: available datasets the user is working with.
+- [EXPLORATION THREAD] section (optional): sequence of datasets that have been explored, in the order they were created, and the questions asked to create them. These tables are all created from tables in the [DATASETS] section.
+- [CURRENT DATA] section (optional): latest data sample the user is viewing, and the visualization they are currently looking at.
+- [START QUESTION] section (optional): start question from previous exploration steps, for context.
+
 Given a dataset (or a thread of datasets that have been explored), your task is to suggest 4 exploration questions (unless the user explicitly asks for the number of questions), that users can follow to gain insights from their data.
-* the user may provide you current explorations they have done, including:
-    - a thread of exploration questions they have explored
-    - the latest data sample they are viewing
-    - the current chart they are viewing
-* when the exploration context is provided, make your suggestion based on the context as well as the original dataset; otherwise leverage the original dataset to suggest questions.
+When the exploration context is provided, base your suggestions on the context as well as the original datasets; otherwise leverage the original datasets to suggest questions.

 Guidelines for question suggestions:
-1. Suggest a list of question_groups of interesting analytical questions that are not obvious that can uncover nontrivial insights.
+1. Suggest a list of question_groups of interesting analytical questions that can uncover new insights from the data.
 2. Use a diverse language style to display the questions (can be questions, statements etc)
 3. If there are multiple datasets in a thread, consider relationships between them
 4. CONCISENESS: the questions should be concise and to the point
@@ -80,6 +86,7 @@
     - hard questions should introduce some new analysis concept but still make it concise
     - if suitable, include a group of questions that are related to statistical analysis: forecasting, regression, or clustering.
 6. QUESTIONS WITHIN A QUESTION GROUP:
+    - if the user doesn't provide an exploration thread, start with a high-level overview question that directly visualizes the data to give the user a sense of the data.
     - raise new questions that are related to the user's goal, do not repeat questions that have already been explored in the context provided to you.
     - if the user provides a start question, suggested questions should be related to the start question.
     - the questions should progressively dive deeper into the data, building on top of the previous question.
@@ -113,15 +120,11 @@ def __init__(self, client, agent_exploration_rules="", db_conn=None):
         self.agent_exploration_rules = agent_exploration_rules
         self.db_conn = db_conn

-    def get_data_summary(self, input_tables):
+    def get_data_summary(self, input_tables, table_name_prefix="Table"):
         if self.db_conn:
-            data_summary = ""
-            for table in input_tables:
-                table_name = sanitize_table_name(table['name'])
-                table_summary_str = get_sql_table_statistics_str(self.db_conn, table_name)
-                data_summary += f"[TABLE {table_name}]\n\n{table_summary_str}\n\n"
+            data_summary = generate_sql_data_summary(self.db_conn, input_tables, table_name_prefix=table_name_prefix)
         else:
-            data_summary = generate_data_summary(input_tables, include_data_samples=False)
+            data_summary = generate_data_summary(input_tables, include_data_samples=False, table_name_prefix=table_name_prefix)
         return data_summary

     def run(self, input_tables, start_question=None, exploration_thread=None,
@@ -144,19 +147,21 @@ def run(self, input_tables, start_question=None, exploration_thread=None,
         data_summary = self.get_data_summary(input_tables)

         # Build context including exploration thread if available
-        context = f"[DATASET]\n\n{data_summary}"
+        context = f"[DATASETS] These are the datasets the user is working with:\n\n{data_summary}"

         if exploration_thread:
-            thread_summary = "Tables in this exploration thread:\n"
-            for i, table in enumerate(exploration_thread, 1):
-                table_name = table.get('name', f'Table {i}')
-                data_summary = self.get_data_summary([{'name': table_name, 'rows': table.get('rows', [])}])
-                table_description = table.get('description', 'No description available')
-                thread_summary += f"{i}. {table_name}: {table_description} \n\n{data_summary}\n\n"
-            context += f"\n\n[EXPLORATION THREAD]\n\n{thread_summary}"
+            thread_summary = self.get_data_summary(
+                [{
+                    'name': table.get('name', f'Table {i}'),
+                    'rows': table.get('rows', []),
+                    'attached_metadata': table.get('description', ''),
+                } for i, table in enumerate(exploration_thread, 1)],
+                table_name_prefix="Thread Table"
+            )
+            context += f"\n\n[EXPLORATION THREAD] This is the sequence of tables the user created in this exploration thread, in the order they were created, and the questions asked to create them:\n\n{thread_summary}"

         if current_data_sample:
-            context += f"\n\n[CURRENT DATA SAMPLE]\n\n{pd.DataFrame(current_data_sample).head(10).to_string()}"
+            context += f"\n\n[CURRENT DATA SAMPLE] This is the current data sample the user is viewing, and the visualization they are currently looking at is shown below:\n\n{pd.DataFrame(current_data_sample).head(10).to_string()}"

         if start_question:
             context += f"\n\n[START QUESTION]\n\n{start_question}"
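For orientation, here is a sketch of how the refactored run() would be driven after this change. Only the constructor and parameters visible in this diff are used; the client construction, table contents, and return shape are illustrative assumptions, not a verified example from the repository.

# Illustrative call into the refactored agent; data values are made up.
from data_formulator.agents.agent_interactive_explore import InteractiveExploreAgent

# Assume `client` is an LLM client (see data_formulator.agents.client_utils);
# construction details are omitted here.
agent = InteractiveExploreAgent(client=client, db_conn=None)

input_tables = [
    {'name': 'stock_prices',
     'rows': [{'symbol': 'MSFT', 'date': '2025-01-02', 'price': 420.1}]},
]
exploration_thread = [
    {'name': 'monthly_avg_price',
     'rows': [{'symbol': 'MSFT', 'month': '2025-01', 'avg_price': 415.3}],
     'description': 'Average closing price per symbol per month'},
]

# run() now assembles the prompt context as labeled sections, in this order:
#   [DATASETS] -> [EXPLORATION THREAD] -> [CURRENT DATA SAMPLE] -> [START QUESTION]
# mirroring the section lists added to SYSTEM_PROMPT and SYSTEM_PROMPT_AGENT above.
suggestions = agent.run(
    input_tables,
    start_question="How volatile are tech stocks month to month?",
    exploration_thread=exploration_thread,
)  # returns suggested exploration questions (exact shape not shown in this diff)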
