Learn how to build web scraping workflows that run safely inside isolated Hopx Sandboxes.

Basic Web Scraping

Scrape a simple webpage:
from hopx_ai import Sandbox
import json

def scrape_website(url: str):
    """Scrape content from a website"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Install requests and BeautifulSoup
        sandbox.commands.run("pip install requests beautifulsoup4 --quiet")
        
        result = sandbox.run_code(f"""
import requests
from bs4 import BeautifulSoup

# Fetch webpage
response = requests.get('{url}')
response.raise_for_status()

# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Extract data
title = soup.find('title')
title_text = title.text if title else 'No title found'

# Extract all links
links = [a.get('href') for a in soup.find_all('a', href=True)]

# Extract all paragraphs
paragraphs = [p.text.strip() for p in soup.find_all('p')]

# Save results
import json
data = {{
    'url': '{url}',
    'title': title_text,
    'links_count': len(links),
    'links': links[:10],  # First 10 links
    'paragraphs_count': len(paragraphs),
    'paragraphs': paragraphs[:5]  # First 5 paragraphs
}}

with open('/workspace/scraped_data.json', 'w') as f:
    json.dump(data, f, indent=2)

print(f"Scraped {url}")
print(f"Title: {title_text}")
print(f"Found {len(links)} links and {len(paragraphs)} paragraphs")
        """)
        
        # Download results
        data_json = sandbox.files.read("/workspace/scraped_data.json")
        return json.loads(data_json)

# Example usage
data = scrape_website("https://example.com")
print(f"Title: {data['title']}")
print(f"Links: {data['links_count']}")

Scraping Multiple Pages

Scrape multiple pages in parallel:
from hopx_ai import Sandbox
import concurrent.futures
import json

def scrape_multiple_pages(urls: list):
    """Scrape multiple pages"""
    def scrape_single(url: str):
        with Sandbox.create(template="code-interpreter") as sandbox:
            sandbox.commands.run("pip install requests beautifulsoup4 --quiet")
            
            result = sandbox.run_code(f"""
import requests
from bs4 import BeautifulSoup
import json

try:
    response = requests.get('{url}', timeout=10)
    response.raise_for_status()
    
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('title')
    title_text = title.text if title else 'No title'
    
    data = {{
        'url': '{url}',
        'title': title_text,
        'status': 'success'
    }}
except Exception as e:
    data = {{
        'url': '{url}',
        'status': 'error',
        'error': str(e)
    }}

with open('/workspace/result.json', 'w') as f:
    json.dump(data, f)
            """)
            
            data_json = sandbox.files.read("/workspace/result.json")
            return json.loads(data_json)
    
    # Scrape in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(scrape_single, urls))
    
    return results

# Example
urls = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://jsonplaceholder.typicode.com"
]

results = scrape_multiple_pages(urls)
for result in results:
    print(f"{result['url']}: {result.get('title', result.get('error'))}")

API Data Extraction

Extract data from APIs:
from hopx_ai import Sandbox
import json

def extract_api_data(api_url: str):
    """Extract and process data from an API"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Install dependencies used inside the sandbox
        sandbox.commands.run("pip install requests pandas --quiet")

        result = sandbox.run_code(f"""
import requests
import json
import pandas as pd

# Fetch data from API
response = requests.get('{api_url}')
response.raise_for_status()

data = response.json()

# Process data
if isinstance(data, list):
    df = pd.DataFrame(data)
    
    # Analysis
    summary = {{
        'total_records': len(df),
        'columns': list(df.columns),
        'summary_stats': df.describe().to_dict() if len(df.select_dtypes(include=['number']).columns) > 0 else None
    }}
else:
    summary = {{
        'type': type(data).__name__,
        'keys': list(data.keys()) if isinstance(data, dict) else None
    }}

# Save results
with open('/workspace/api_data.json', 'w') as f:
    json.dump(data, f, indent=2)

with open('/workspace/summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Extracted {len(data) if isinstance(data, list) else 1} records")
        """)
        
        # Download results
        summary_json = sandbox.files.read("/workspace/summary.json")
        return json.loads(summary_json)

# Example
summary = extract_api_data("https://jsonplaceholder.typicode.com/posts")
print(f"Total records: {summary['total_records']}")
print(f"Columns: {summary['columns']}")

Next Steps