## Overview

This tutorial shows you how to create a complete data analysis pipeline that:

- Uploads and processes CSV/Excel files
- Performs statistical analysis
- Generates visualizations
- Creates summary reports
- Downloads results
## Prerequisites

- Python SDK installed (`pip install hopx-ai`)
- API key configured (see Authentication)
- Basic understanding of Pandas and NumPy
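If you have not configured credentials yet, one common pattern is to expose the API key through an environment variable before creating sandboxes. The variable name below is an assumption used only for illustration; the Authentication guide documents the exact name the SDK expects.

```python
import os

# Assumed variable name for illustration only -- see the Authentication guide
os.environ.setdefault("HOPX_API_KEY", "your-api-key")
```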
## Step 1: Basic Data Processing

Start with a simple data processing function:

```python
from hopx_ai import Sandbox

def analyze_data(csv_content: str):
    """Analyze CSV data and return statistics"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Upload data
        sandbox.files.write("/workspace/data.csv", csv_content)

        # Analyze
        result = sandbox.run_code("""
import pandas as pd
import json

# Load data
df = pd.read_csv('/workspace/data.csv')

# Basic statistics
print("Dataset Shape:", df.shape)
print("\\nSummary Statistics:")
print(df.describe())

# Save results
stats = df.describe().to_dict()
with open('/workspace/stats.json', 'w') as f:
    json.dump(stats, f, indent=2)
""")

        # Download results
        stats = sandbox.files.read("/workspace/stats.json")

        return {
            "output": result.stdout,
            "statistics": stats
        }

# Example usage
csv_data = """name,age,score
Alice,25,95
Bob,30,87
Charlie,35,92
"""

results = analyze_data(csv_data)
print(results["output"])
```
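The overview also mentions Excel input. Since `analyze_data` expects CSV text, one simple approach is to convert the sheet to CSV locally before uploading. This is a sketch assuming a local workbook named `data.xlsx` and the `openpyxl` engine installed for pandas:

```python
import pandas as pd

# Hypothetical local workbook; reading .xlsx files requires the openpyxl package
excel_df = pd.read_excel("data.xlsx")

# Convert the first sheet to CSV text and reuse the function defined above
csv_text = excel_df.to_csv(index=False)
excel_results = analyze_data(csv_text)
print(excel_results["output"])
```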
## Step 2: Add Visualizations

Generate plots and charts:

```python
from hopx_ai import Sandbox
import base64

def analyze_with_visualizations(csv_content: str):
    """Analyze data and generate visualizations"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Upload data
        sandbox.files.write("/workspace/data.csv", csv_content)

        # Install packages if needed
        sandbox.commands.run("pip install matplotlib seaborn --quiet")

        # Analyze and visualize
        result = sandbox.run_code("""
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('/workspace/data.csv')

# Set style
sns.set_style('whitegrid')

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Data Analysis Dashboard', fontsize=16, fontweight='bold')

# Age distribution
axes[0, 0].hist(df['age'], bins=10, edgecolor='black')
axes[0, 0].set_title('Age Distribution')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Count')

# Score distribution
axes[0, 1].hist(df['score'], bins=10, color='green', edgecolor='black')
axes[0, 1].set_title('Score Distribution')
axes[0, 1].set_xlabel('Score')
axes[0, 1].set_ylabel('Count')

# Age vs Score scatter
axes[1, 0].scatter(df['age'], df['score'], s=100, alpha=0.6)
axes[1, 0].set_title('Age vs Score')
axes[1, 0].set_xlabel('Age')
axes[1, 0].set_ylabel('Score')
axes[1, 0].grid(True, alpha=0.3)

# Correlation heatmap
if len(df.select_dtypes(include=['number']).columns) > 1:
    correlation = df.select_dtypes(include=['number']).corr()
    sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', ax=axes[1, 1])
    axes[1, 1].set_title('Correlation Matrix')

plt.tight_layout()
plt.savefig('/workspace/analysis.png', dpi=150, bbox_inches='tight')
print("✅ Visualization saved!")
""")

        # Download visualization
        image_data = sandbox.files.read_bytes("/workspace/analysis.png")

        return {
            "output": result.stdout,
            "visualization": base64.b64encode(image_data).decode('utf-8')
        }

# Example usage
csv_data = """name,age,score
Alice,25,95
Bob,30,87
Charlie,35,92
Diana,28,98
Eve,32,89
"""

results = analyze_with_visualizations(csv_data)
# results["visualization"] contains base64-encoded PNG
```
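To inspect the chart locally, decode the base64 string back into bytes and write it to a file:

```python
import base64

# Decode the base64-encoded PNG returned above and save it locally
png_bytes = base64.b64decode(results["visualization"])
with open("analysis.png", "wb") as f:
    f.write(png_bytes)
```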
## Step 3: Complete Pipeline

Build a complete pipeline with multiple steps:

```python
from hopx_ai import Sandbox
from typing import Dict, Any
import json

class DataAnalysisPipeline:
    """Complete data analysis pipeline"""

    def __init__(self):
        self.template = "code-interpreter"

    def run(self, csv_content: str) -> Dict[str, Any]:
        """Run complete analysis pipeline"""
        with Sandbox.create(template=self.template) as sandbox:
            # Step 1: Upload data
            sandbox.files.write("/workspace/data.csv", csv_content)

            # Step 2: Data cleaning and validation
            cleaning_result = sandbox.run_code("""
import pandas as pd

df = pd.read_csv('/workspace/data.csv')

# Data cleaning
df = df.dropna()
df = df.drop_duplicates()

# Save cleaned data
df.to_csv('/workspace/cleaned_data.csv', index=False)
print(f"Cleaned data: {len(df)} rows")
""")

            # Step 3: Statistical analysis
            stats_result = sandbox.run_code("""
import pandas as pd
import json

df = pd.read_csv('/workspace/cleaned_data.csv')

# Calculate statistics
stats = {
    'row_count': len(df),
    'column_count': len(df.columns),
    'summary': df.describe().to_dict(),
    'missing_values': df.isnull().sum().to_dict(),
    'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()}
}

# Save statistics
with open('/workspace/stats.json', 'w') as f:
    json.dump(stats, f, indent=2)
print("Statistics calculated")
""")

            # Step 4: Generate visualizations
            viz_result = sandbox.run_code("""
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/workspace/cleaned_data.csv')

# Get numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

if len(numeric_cols) > 0:
    fig, axes = plt.subplots(len(numeric_cols), 1, figsize=(12, 4 * len(numeric_cols)))
    if len(numeric_cols) == 1:
        axes = [axes]

    for i, col in enumerate(numeric_cols):
        axes[i].hist(df[col], bins=20, edgecolor='black')
        axes[i].set_title(f'{col} Distribution')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')

    plt.tight_layout()
    plt.savefig('/workspace/distributions.png', dpi=150, bbox_inches='tight')
    print("Visualizations generated")
else:
    print("No numeric columns for visualization")
""")

            # Step 5: Generate report
            # The report template uses triple single quotes so it does not
            # terminate the surrounding triple-double-quoted code string.
            report_result = sandbox.run_code("""
import pandas as pd
import json

df = pd.read_csv('/workspace/cleaned_data.csv')
with open('/workspace/stats.json') as f:
    stats = json.load(f)

# Generate report
report = f'''
# Data Analysis Report

## Dataset Overview
- Rows: {stats["row_count"]}
- Columns: {stats["column_count"]}

## Summary Statistics
{pd.DataFrame(stats["summary"]).to_markdown()}

## Data Quality
- Missing values: {sum(stats["missing_values"].values())}
'''

with open('/workspace/report.md', 'w') as f:
    f.write(report)
print("Report generated")
""")

            # Download all results
            results = {
                "statistics": json.loads(sandbox.files.read("/workspace/stats.json")),
                "report": sandbox.files.read("/workspace/report.md"),
                "outputs": {
                    "cleaning": cleaning_result.stdout,
                    "analysis": stats_result.stdout,
                    "visualization": viz_result.stdout,
                    "report": report_result.stdout
                }
            }

            # Download visualization if it exists
            try:
                image_data = sandbox.files.read_bytes("/workspace/distributions.png")
                results["visualization"] = image_data
            except Exception:
                pass

            return results

# Usage
pipeline = DataAnalysisPipeline()

csv_data = """name,age,score,department
Alice,25,95,Engineering
Bob,30,87,Marketing
Charlie,35,92,Engineering
Diana,28,98,Sales
Eve,32,89,Marketing
"""

results = pipeline.run(csv_data)
print(results["report"])
```
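The pipeline returns the report as Markdown text, the statistics as a dictionary, and the visualization (when one was generated) as raw PNG bytes, so persisting everything locally is straightforward:

```python
import json

# Write the report and statistics to local files
with open("report.md", "w") as f:
    f.write(results["report"])
with open("stats.json", "w") as f:
    json.dump(results["statistics"], f, indent=2)

# The visualization key is only set when the sandbox produced a plot
if "visualization" in results:
    with open("distributions.png", "wb") as f:
        f.write(results["visualization"])
```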
## Best Practices

### ✅ DO: Use Appropriate Resources

For data analysis workloads, choose templates with sufficient resources:

```python
# Choose a template with appropriate resources for data analysis
# Resources are determined by the template, not specified during creation
sandbox = Sandbox.create(template="code-interpreter")

# Check template resources
template = Sandbox.get_template("code-interpreter")
print(f"Template resources: {template.resources.vcpu} vCPU, {template.resources.memory_mb}MB RAM")
```
### ✅ DO: Handle Large Files

For large datasets, process data in chunks within the sandbox:

```python
# Process a large CSV in chunks inside the sandbox
code = """
import pandas as pd

# Read the CSV in chunks to keep memory usage low
chunk_size = 10000
total_rows = 0
for chunk in pd.read_csv('/workspace/large_file.csv', chunksize=chunk_size):
    # Process each chunk here (aggregate, filter, transform, ...)
    total_rows += len(chunk)

print(f"Processed {total_rows} rows")
"""

sandbox.run_code(code)
```
### ✅ DO: Cache Intermediate Results

Save intermediate results to avoid recomputation:

```python
# Save cleaned data
sandbox.run_code("df.to_csv('/workspace/cleaned_data.csv', index=False)")

# Use cleaned data in the next step
sandbox.run_code("df = pd.read_csv('/workspace/cleaned_data.csv')")
```