Learn how to build a production-ready data analysis pipeline that processes data, generates visualizations, and creates reports using Hopx Sandboxes.

Overview

This tutorial shows you how to create a complete data analysis pipeline that:
  • Uploads and processes CSV/Excel files
  • Performs statistical analysis
  • Generates visualizations
  • Creates summary reports
  • Downloads results

Prerequisites

  • Python SDK installed (pip install hopx-ai)
  • API key configured (see Authentication)
  • Basic understanding of Pandas and NumPy
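
Before diving in, you can verify your setup with a quick smoke test (a minimal sketch using only the Sandbox.create and run_code calls this tutorial relies on):

from hopx_ai import Sandbox

# Confirm the SDK can create a sandbox and execute code
with Sandbox.create(template="code-interpreter") as sandbox:
    result = sandbox.run_code("print('sandbox ready')")
    print(result.stdout)  # -> sandbox ready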

Step 1: Basic Data Processing

Start with a simple data processing function:
from hopx_ai import Sandbox
import json

def analyze_data(csv_content: str):
    """Analyze CSV data and return statistics"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Upload data
        sandbox.files.write("/workspace/data.csv", csv_content)
        
        # Analyze
        result = sandbox.run_code("""
import pandas as pd

# Load data
df = pd.read_csv('/workspace/data.csv')

# Basic statistics
print("Dataset Shape:", df.shape)
print("\\nSummary Statistics:")
print(df.describe())

# Save results
stats = df.describe().to_dict()
import json
with open('/workspace/stats.json', 'w') as f:
    json.dump(stats, f, indent=2)
        """)
        
        # Download results (stats.json is returned as a JSON string; parse it)
        stats = json.loads(sandbox.files.read("/workspace/stats.json"))
        return {
            "output": result.stdout,
            "statistics": stats
        }

# Example usage
csv_data = """name,age,score
Alice,25,95
Bob,30,87
Charlie,35,92
"""

results = analyze_data(csv_data)
print(results["output"])
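
Because the parsed statistics dictionary is keyed by column and then by statistic (the shape produced by pandas' describe().to_dict()), individual values can be read directly:

# Pull a single value out of the parsed statistics
mean_score = results["statistics"]["score"]["mean"]
print(f"Mean score: {mean_score:.2f}")  # 91.33 for the sample data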

Step 2: Add Visualizations

Generate plots and charts:
from hopx_ai import Sandbox
import base64

def analyze_with_visualizations(csv_content: str):
    """Analyze data and generate visualizations"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Upload data
        sandbox.files.write("/workspace/data.csv", csv_content)
        
        # Install packages if needed
        sandbox.commands.run("pip install matplotlib seaborn --quiet")
        
        # Analyze and visualize
        result = sandbox.run_code("""
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
df = pd.read_csv('/workspace/data.csv')

# Set style
sns.set_style('whitegrid')

# Create figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Data Analysis Dashboard', fontsize=16, fontweight='bold')

# Age distribution
axes[0, 0].hist(df['age'], bins=10, edgecolor='black')
axes[0, 0].set_title('Age Distribution')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Count')

# Score distribution
axes[0, 1].hist(df['score'], bins=10, color='green', edgecolor='black')
axes[0, 1].set_title('Score Distribution')
axes[0, 1].set_xlabel('Score')
axes[0, 1].set_ylabel('Count')

# Age vs Score scatter
axes[1, 0].scatter(df['age'], df['score'], s=100, alpha=0.6)
axes[1, 0].set_title('Age vs Score')
axes[1, 0].set_xlabel('Age')
axes[1, 0].set_ylabel('Score')
axes[1, 0].grid(True, alpha=0.3)

# Correlation heatmap
if len(df.select_dtypes(include=['number']).columns) > 1:
    correlation = df.select_dtypes(include=['number']).corr()
    sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm', ax=axes[1, 1])
    axes[1, 1].set_title('Correlation Matrix')

plt.tight_layout()
plt.savefig('/workspace/analysis.png', dpi=150, bbox_inches='tight')
print("✅ Visualization saved!")
        """)
        
        # Download visualization
        image_data = sandbox.files.read_bytes("/workspace/analysis.png")
        
        return {
            "output": result.stdout,
            "visualization": base64.b64encode(image_data).decode('utf-8')
        }

# Example usage
csv_data = """name,age,score
Alice,25,95
Bob,30,87
Charlie,35,92
Diana,28,98
Eve,32,89
"""

results = analyze_with_visualizations(csv_data)
# results["visualization"] contains base64-encoded PNG
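
To view the chart locally, decode the base64 payload back into a PNG file:

import base64

# Decode the base64 string returned by analyze_with_visualizations
with open("analysis.png", "wb") as f:
    f.write(base64.b64decode(results["visualization"]))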

Step 3: Complete Pipeline

Build a complete pipeline with multiple steps:
from hopx_ai import Sandbox
from typing import Dict, Any
import json

class DataAnalysisPipeline:
    """Complete data analysis pipeline"""
    
    def __init__(self):
        self.template = "code-interpreter"
    
    def run(self, csv_content: str) -> Dict[str, Any]:
        """Run complete analysis pipeline"""
        with Sandbox.create(template=self.template) as sandbox:
            # Step 1: Upload data
            sandbox.files.write("/workspace/data.csv", csv_content)
            
            # Step 2: Data cleaning and validation
            clean_result = sandbox.run_code("""
import pandas as pd

df = pd.read_csv('/workspace/data.csv')

# Data cleaning
df = df.dropna()
df = df.drop_duplicates()

# Save cleaned data
df.to_csv('/workspace/cleaned_data.csv', index=False)
print(f"Cleaned data: {len(df)} rows")
            """)
            
            # Step 3: Statistical analysis
            stats_result = sandbox.run_code("""
import pandas as pd
import json

df = pd.read_csv('/workspace/cleaned_data.csv')

# Calculate statistics
stats = {
    'row_count': len(df),
    'column_count': len(df.columns),
    'summary': df.describe().to_dict(),
    'missing_values': df.isnull().sum().to_dict(),
    'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()}
}

# Save statistics
with open('/workspace/stats.json', 'w') as f:
    json.dump(stats, f, indent=2)

print("Statistics calculated")
            """)
            
            # Step 4: Generate visualizations
            viz_result = sandbox.run_code("""
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('/workspace/cleaned_data.csv')

# Get numeric columns
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()

if len(numeric_cols) > 0:
    fig, axes = plt.subplots(len(numeric_cols), 1, figsize=(12, 4 * len(numeric_cols)))
    if len(numeric_cols) == 1:
        axes = [axes]
    
    for i, col in enumerate(numeric_cols):
        axes[i].hist(df[col], bins=20, edgecolor='black')
        axes[i].set_title(f'{col} Distribution')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
    
    plt.tight_layout()
    plt.savefig('/workspace/distributions.png', dpi=150, bbox_inches='tight')
    print("Visualizations generated")
else:
    print("No numeric columns for visualization")
            """)
            
            # Step 5: Generate report
            report_result = sandbox.run_code("""
import pandas as pd
import json

df = pd.read_csv('/workspace/cleaned_data.csv')

with open('/workspace/stats.json') as f:
    stats = json.load(f)

# Generate a Markdown report (DataFrame.to_markdown() requires the
# tabulate package). Note the triple single-quote delimiters below:
# nested triple double-quotes would end the surrounding run_code
# string early and break the whole step.
report = f'''
# Data Analysis Report

## Dataset Overview
- Rows: {stats["row_count"]}
- Columns: {stats["column_count"]}

## Summary Statistics
{pd.DataFrame(stats["summary"]).to_markdown()}

## Data Quality
- Missing values: {sum(stats["missing_values"].values())}
'''

with open('/workspace/report.md', 'w') as f:
    f.write(report)

print("Report generated")
            """)
            
            # Download all results
            results = {
                "statistics": json.loads(sandbox.files.read("/workspace/stats.json")),
                "report": sandbox.files.read("/workspace/report.md"),
                "outputs": {
                    "cleaning": stats_result.stdout,
                    "analysis": stats_result.stdout,
                    "visualization": viz_result.stdout,
                    "report": report_result.stdout
                }
            }
            
            # Download visualization if it was generated
            try:
                image_data = sandbox.files.read_bytes("/workspace/distributions.png")
                results["visualization"] = image_data
            except Exception:
                # No numeric columns means no chart was produced
                pass
            
            return results

# Usage
pipeline = DataAnalysisPipeline()

csv_data = """name,age,score,department
Alice,25,95,Engineering
Bob,30,87,Marketing
Charlie,35,92,Engineering
Diana,28,98,Sales
Eve,32,89,Marketing
"""

results = pipeline.run(csv_data)
print(results["report"])
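
The pipeline's outputs can then be persisted locally, for example:

# Save the Markdown report and, if present, the distribution chart
with open("report.md", "w") as f:
    f.write(results["report"])

if "visualization" in results:
    # read_bytes returned raw PNG bytes, so no decoding is needed here
    with open("distributions.png", "wb") as f:
        f.write(results["visualization"])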

Best Practices

For data analysis workloads, choose templates with sufficient resources:
# Choose a template with appropriate resources for data analysis
# Resources are determined by the template, not specified during creation
sandbox = Sandbox.create(template="code-interpreter")

# Check template resources
template = Sandbox.get_template("code-interpreter")
print(f"Template resources: {template.resources.vcpu} vCPU, {template.resources.memory_mb}MB RAM")
For large datasets, process data in chunks within the sandbox:
# Process large CSV in chunks
code = """
import pandas as pd

# Read CSV in chunks
chunk_size = 10000
for chunk in pd.read_csv('/workspace/large_file.csv', chunksize=chunk_size):
    # process_chunk is a placeholder for your own per-chunk logic
    process_chunk(chunk)
"""

sandbox.run_code(code)
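
As a concrete sketch of per-chunk logic (assuming a numeric 'score' column, which is hypothetical here), you can accumulate running totals so the full file never needs to fit in memory:

chunk_code = """
import pandas as pd

# Compute a column mean incrementally across chunks
total, count = 0.0, 0
for chunk in pd.read_csv('/workspace/large_file.csv', chunksize=10000):
    total += chunk['score'].sum()  # 'score' is an assumed column name
    count += len(chunk)

print('mean score:', total / count)
"""

sandbox.run_code(chunk_code)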
Save intermediate results to avoid recomputation:
# Save cleaned data (index=False avoids writing a spurious index column)
sandbox.run_code("df.to_csv('/workspace/cleaned_data.csv', index=False)")

# Use cleaned data in next step
sandbox.run_code("df = pd.read_csv('/workspace/cleaned_data.csv')")

Next Steps