Skip to content

Commit 257374f

Browse files
committed
adding azure blob data loader
1 parent e8799a6 commit 257374f

8 files changed

Lines changed: 507 additions & 31 deletions

File tree

py-src/data_formulator/data_loader/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
33
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader
44
from data_formulator.data_loader.s3_data_loader import S3DataLoader
5+
from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader
56

67
DATA_LOADERS = {
78
"mysql": MySQLDataLoader,
89
"kusto": KustoDataLoader,
910
"s3": S3DataLoader,
11+
"azure_blob": AzureBlobDataLoader,
1012
}
1113

12-
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "DATA_LOADERS"]
14+
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "DATA_LOADERS"]

py-src/data_formulator/data_loader/azure_blob_data_loader.py

Lines changed: 363 additions & 0 deletions
Large diffs are not rendered by default.

py-src/data_formulator/data_loader/external_data_loader.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
6767
def list_params() -> List[Dict[str, Any]]:
6868
pass
6969

70+
@staticmethod
71+
@abstractmethod
72+
def auth_instructions() -> str:
73+
pass
74+
7075
@abstractmethod
7176
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
7277
pass

py-src/data_formulator/data_loader/kusto_data_loader.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,41 @@ def list_params() -> bool:
2323
{"name": "tenant_id", "type": "string", "required": False, "description": "only necessary for AppKey auth"}
2424
]
2525
return params_list
26+
27+
@staticmethod
28+
def auth_instructions() -> str:
29+
return """
30+
Azure Kusto Authentication Instructions:
31+
32+
This data loader supports two authentication methods:
33+
34+
**Method 1: Azure CLI Authentication (Recommended for development)**
35+
1. Install Azure CLI: https://docs.microsoft.com/en-us/cli/azure/install-azure-cli
36+
2. Run `az login` in your terminal to authenticate
37+
3. Ensure you have access to the specified Kusto cluster and database
38+
4. Leave client_id, client_secret, and tenant_id parameters empty
39+
40+
**Method 2: Application Key Authentication (Recommended for production)**
41+
1. Register an Azure AD application in your tenant
42+
2. Generate a client secret for the application
43+
3. Grant the application appropriate permissions to your Kusto cluster:
44+
- Go to your Kusto cluster in Azure Portal
45+
- Navigate to Permissions > Add
46+
- Add your application as a user with appropriate role (e.g., "AllDatabasesViewer" for read access)
47+
4. Provide the following parameters:
48+
- client_id: Application (client) ID from your Azure AD app registration
49+
- client_secret: Client secret value you generated
50+
- tenant_id: Directory (tenant) ID from your Azure AD
51+
52+
**Required Parameters:**
53+
- kusto_cluster: Your Kusto cluster URI (e.g., "https://mycluster.region.kusto.windows.net")
54+
- kusto_database: Name of the database you want to access
55+
56+
**Troubleshooting:**
57+
- If authentication fails, ensure you have the correct permissions on the Kusto cluster
58+
- For CLI auth, confirm you're already logged in by running `az account show`
59+
- For app key auth, verify your client_id, client_secret, and tenant_id are correct
60+
"""
2661

2762
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
2863

py-src/data_formulator/data_loader/mysql_data_loader.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,38 @@ def list_params() -> bool:
1818
]
1919
return params_list
2020

21+
@staticmethod
22+
def auth_instructions() -> str:
23+
return """
24+
MySQL Connection Instructions:
25+
26+
1. **Local MySQL Setup:**
27+
- Ensure MySQL server is running on your machine
28+
- Default connection: host='localhost', user='root'
29+
- If you haven't set a root password, leave password field empty
30+
31+
2. **Remote MySQL Connection:**
32+
- Obtain host address, username, and password from your database administrator
33+
- Ensure the MySQL server allows remote connections
34+
- Check that your IP is whitelisted in MySQL's user permissions
35+
36+
3. **Common Connection Parameters:**
37+
- user: Your MySQL username (default: 'root')
38+
- password: Your MySQL password (leave empty if no password set)
39+
- host: MySQL server address (default: 'localhost')
40+
- database: Target database name to connect to
41+
42+
4. **Troubleshooting:**
43+
- Verify MySQL service is running: `brew services list` (macOS) or `sudo systemctl status mysql` (Linux)
44+
- Test connection: `mysql -u [username] -p -h [host] [database]`
45+
- Common issues: Wrong credentials, server not running, firewall blocking connection
46+
47+
5. **Security Notes:**
48+
- Use dedicated database users with limited privileges for applications
49+
- Avoid using root user for application connections
50+
- Consider using SSL connections for remote databases
51+
"""
52+
2153
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
2254
self.params = params
2355
self.duck_db_conn = duck_db_conn

py-src/data_formulator/data_loader/s3_data_loader.py

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,70 @@ def list_params() -> List[Dict[str, Any]]:
1919
]
2020
return params_list
2121

22+
@staticmethod
23+
def auth_instructions() -> str:
24+
return """
25+
To connect to Amazon S3, you'll need the following AWS credentials and configuration:
26+
27+
**Required Parameters:**
28+
- **AWS Access Key ID**: Your AWS access key identifier
29+
- **AWS Secret Access Key**: Your AWS secret access key
30+
- **Region Name**: The AWS region where your S3 bucket is located (e.g., 'us-east-1', 'us-west-2')
31+
- **Bucket**: The name of your S3 bucket
32+
33+
**Optional Parameters:**
34+
- **AWS Session Token**: Required only if using temporary credentials (e.g., from AWS STS or IAM roles)
35+
36+
**How to Get AWS Credentials:**
37+
38+
1. **AWS IAM User (Recommended for programmatic access):**
39+
- Go to AWS Console → IAM → Users
40+
- Create a new user or select existing user
41+
- Go to "Security credentials" tab
42+
- Click "Create access key"
43+
- Choose "Application running outside AWS"
44+
- Save both the Access Key ID and Secret Access Key securely
45+
46+
2. **Required S3 Permissions:**
47+
Your IAM user/role needs these permissions for the target bucket:
48+
```json
49+
{
50+
"Version": "2012-10-17",
51+
"Statement": [
52+
{
53+
"Effect": "Allow",
54+
"Action": [
55+
"s3:GetObject",
56+
"s3:ListBucket"
57+
],
58+
"Resource": [
59+
"arn:aws:s3:::your-bucket-name",
60+
"arn:aws:s3:::your-bucket-name/*"
61+
]
62+
}
63+
]
64+
}
65+
```
66+
67+
3. **Finding Your Region:**
68+
- Go to S3 Console → Select your bucket → Properties
69+
- Look for "AWS Region" in the bucket overview
70+
71+
**Security Best Practices:**
72+
- Never share your secret access key
73+
- Use IAM roles when possible instead of long-term access keys
74+
- Consider using temporary credentials with session tokens for enhanced security
75+
- Regularly rotate your access keys
76+
- Use the principle of least privilege for S3 permissions
77+
78+
**Supported File Formats:**
79+
- CSV files (.csv)
80+
- Parquet files (.parquet)
81+
- JSON files (.json, .jsonl)
82+
83+
The connector will automatically detect file types and load them appropriately using DuckDB's S3 integration.
84+
"""
85+
2286
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
2387
self.params = params
2488
self.duck_db_conn = duck_db_conn
@@ -120,32 +184,9 @@ def _estimate_row_count(self, s3_url: str) -> int:
120184
count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM read_parquet('{s3_url}')").fetchone()[0]
121185
return count
122186

123-
# For CSV files, we'll sample the file to estimate size
124-
sample_size = 1000
125-
sample_df = self.duck_db_conn.execute(f"SELECT * FROM read_csv_auto('{s3_url}') LIMIT {sample_size}").df()
126-
127-
# Get file size from S3
128-
import boto3
129-
s3_client = boto3.client(
130-
's3',
131-
aws_access_key_id=self.aws_access_key_id,
132-
aws_secret_access_key=self.aws_secret_access_key,
133-
aws_session_token=self.aws_session_token if self.aws_session_token else None,
134-
region_name=self.region_name
135-
)
136-
137-
key = s3_url.replace(f"s3://{self.bucket}/", "")
138-
response = s3_client.head_object(Bucket=self.bucket, Key=key)
139-
file_size = response['ContentLength']
140-
141-
# Estimate based on sample size and file size
142-
if len(sample_df) > 0:
143-
# Calculate average row size in bytes
144-
avg_row_size = file_size / len(sample_df)
145-
estimated_rows = int(file_size / avg_row_size)
146-
return min(estimated_rows, 1000000) # Cap at 1 million for UI performance
147-
148-
return 0
187+
# For CSV, JSON, and JSONL files, we'll skip row count
188+
if s3_url.lower().endswith('.csv') or s3_url.lower().endswith('.json') or s3_url.lower().endswith('.jsonl'):
189+
return 0
149190
except Exception as e:
150191
print(f"Error estimating row count for {s3_url}: {e}")
151192
return 0

src/views/DBTableManager.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,7 @@ export const DBTableSelectionDialog: React.FC<{ buttonElement: any }> = function
653653
sx={{px: 0.5}}
654654
>
655655
<Typography variant="caption" sx={{color: "text.secondary", fontWeight: "bold", px: 1}}>connect external data</Typography>
656-
{["file upload", "mysql", "kusto","s3"].map((dataLoaderType, i) => (
656+
{["file upload", ...Object.keys(dataLoaderParamDefs ?? {})].map((dataLoaderType, i) => (
657657
<Tab
658658
key={`dataLoader:${dataLoaderType}`}
659659
wrapped
@@ -911,7 +911,7 @@ export const DataLoaderForm: React.FC<{
911911
return [
912912
<TableRow
913913
key={tableName}
914-
sx={{ '&:last-child td, &:last-child th': { border: 0 }, '& .MuiTableCell-root': { padding: 0.25 }}}
914+
sx={{ '&:last-child td, &:last-child th': { border: 0 }, '& .MuiTableCell-root': { padding: 0.25, wordWrap: 'break-word', whiteSpace: 'normal' }}}
915915
>
916916
<TableCell sx={{borderBottom: displaySamples[tableName] ? 'none' : '1px solid rgba(0, 0, 0, 0.1)'}}>
917917
<IconButton size="small" onClick={() => toggleDisplaySamples(tableName)}>

src/views/VisualizationView.tsx

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,8 +363,6 @@ export const ChartEditorFC: FC<{ cachedCandidates: DictTable[],
363363
const [errorMessage, setErrorMessage] = useState<{content: string, severity: "error" | "warning" | "info" | "success"}>({content: "", severity: "error"});
364364
const [showError, setShowError] = useState<boolean>(false);
365365

366-
367-
368366
let createVisTableRowsLocal = (rows: any[]) => {
369367
if (visFields.length == 0) {
370368
return rows;

0 commit comments

Comments (0)