Basic Web Scraping
Scrape a simple webpage:
from hopx_ai import Sandbox
import json

def scrape_website(url: str):
    """Scrape content from a website"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Install requests and BeautifulSoup
        sandbox.commands.run("pip install requests beautifulsoup4 --quiet")

        result = sandbox.run_code(f"""
import requests
import json
from bs4 import BeautifulSoup

# Fetch webpage
response = requests.get('{url}')
response.raise_for_status()

# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the page title
title = soup.find('title')
title_text = title.text if title else 'No title found'

# Extract all links
links = [a.get('href') for a in soup.find_all('a', href=True)]

# Extract all paragraphs
paragraphs = [p.text.strip() for p in soup.find_all('p')]

# Save results
data = {{
    'url': '{url}',
    'title': title_text,
    'links_count': len(links),
    'links': links[:10],  # First 10 links
    'paragraphs_count': len(paragraphs),
    'paragraphs': paragraphs[:5]  # First 5 paragraphs
}}

with open('/workspace/scraped_data.json', 'w') as f:
    json.dump(data, f, indent=2)

print(f"Scraped {url}")
print(f"Title: {{title_text}}")
print(f"Found {{len(links)}} links and {{len(paragraphs)}} paragraphs")
""")

        # Download results
        data_json = sandbox.files.read("/workspace/scraped_data.json")
        return json.loads(data_json)

# Example usage
data = scrape_website("https://example.com")
print(f"Title: {data['title']}")
print(f"Links: {data['links_count']}")
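If the page's data lives in HTML tables, pandas can parse them directly instead of walking the DOM with BeautifulSoup. Below is a minimal sketch following the same sandbox pattern as above; it assumes pandas and lxml are not preinstalled in the code-interpreter template (so it installs them explicitly), and note that pandas.read_html raises ValueError if the page contains no table.

from hopx_ai import Sandbox

def scrape_tables(url: str):
    """Extract the first HTML table on a page to CSV (sketch)"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Assumption: pandas/lxml may not be preinstalled in the template
        sandbox.commands.run("pip install pandas lxml --quiet")

        sandbox.run_code(f"""
import pandas as pd

# read_html returns one DataFrame per <table> on the page
tables = pd.read_html('{url}')
tables[0].to_csv('/workspace/table_0.csv', index=False)
print(f"Found {{len(tables)}} tables")
""")

        return sandbox.files.read("/workspace/table_0.csv")

# Example: any page with at least one <table> works
csv_text = scrape_tables("https://en.wikipedia.org/wiki/List_of_sovereign_states")
print(csv_text[:200])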
Scraping Multiple Pages
Scrape multiple pages in parallel:
from hopx_ai import Sandbox
import concurrent.futures
import json

def scrape_multiple_pages(urls: list):
    """Scrape multiple pages in parallel, one sandbox per URL"""
    def scrape_single(url: str):
        with Sandbox.create(template="code-interpreter") as sandbox:
            sandbox.commands.run("pip install requests beautifulsoup4 --quiet")

            result = sandbox.run_code(f"""
import requests
import json
from bs4 import BeautifulSoup

try:
    response = requests.get('{url}', timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find('title')
    title_text = title.text if title else 'No title'

    data = {{
        'url': '{url}',
        'title': title_text,
        'status': 'success'
    }}
except Exception as e:
    data = {{
        'url': '{url}',
        'status': 'error',
        'error': str(e)
    }}

with open('/workspace/result.json', 'w') as f:
    json.dump(data, f)
""")

            data_json = sandbox.files.read("/workspace/result.json")
            return json.loads(data_json)

    # Scrape in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        results = list(executor.map(scrape_single, urls))

    return results

# Example
urls = [
    "https://example.com",
    "https://httpbin.org/html",
    "https://jsonplaceholder.typicode.com"
]

results = scrape_multiple_pages(urls)
for result in results:
    print(f"{result['url']}: {result.get('title', result.get('error'))}")
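One sandbox per URL gives the strongest isolation, but each sandbox has startup overhead. When the pages can safely share an environment, a single sandbox can scrape the whole batch. A minimal sketch, reusing only the Sandbox calls shown above; the URL list is embedded as JSON so the generated script stays valid Python:

from hopx_ai import Sandbox
import json

def scrape_batch(urls: list):
    """Scrape several URLs inside a single shared sandbox (sketch)"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        sandbox.commands.run("pip install requests beautifulsoup4 --quiet")

        # json.dumps(urls) renders the list as a valid Python literal
        sandbox.run_code(f"""
import requests, json
from bs4 import BeautifulSoup

results = []
for url in {json.dumps(urls)}:
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('title')
        results.append({{'url': url, 'title': title.text if title else 'No title', 'status': 'success'}})
    except Exception as e:
        results.append({{'url': url, 'status': 'error', 'error': str(e)}})

with open('/workspace/batch.json', 'w') as f:
    json.dump(results, f)
""")

        return json.loads(sandbox.files.read("/workspace/batch.json"))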
API Data Extraction
Extract data from APIs:
from hopx_ai import Sandbox
import json

def extract_api_data(api_url: str):
    """Extract and process data from an API"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        # Install dependencies (as in the examples above, requests is installed explicitly)
        sandbox.commands.run("pip install requests pandas --quiet")

        result = sandbox.run_code(f"""
import requests
import json
import pandas as pd

# Fetch data from API
response = requests.get('{api_url}')
response.raise_for_status()
data = response.json()

# Process data
if isinstance(data, list):
    df = pd.DataFrame(data)

    # Analysis
    summary = {{
        'total_records': len(df),
        'columns': list(df.columns),
        'summary_stats': df.describe().to_dict() if len(df.select_dtypes(include=['number']).columns) > 0 else None
    }}
else:
    summary = {{
        'type': type(data).__name__,
        'keys': list(data.keys()) if isinstance(data, dict) else None
    }}

# Save results
with open('/workspace/api_data.json', 'w') as f:
    json.dump(data, f, indent=2)

with open('/workspace/summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Extracted {{len(data) if isinstance(data, list) else 1}} records")
""")

        # Download results
        summary_json = sandbox.files.read("/workspace/summary.json")
        return json.loads(summary_json)

# Example
summary = extract_api_data("https://jsonplaceholder.typicode.com/posts")
print(f"Total records: {summary['total_records']}")
print(f"Columns: {summary['columns']}")
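Many APIs return results a page at a time rather than all at once. A minimal sketch, assuming json-server-style pagination via a _page query parameter (which JSONPlaceholder supports); swap in your API's own pagination parameters if they differ.

from hopx_ai import Sandbox
import json

def extract_paginated(api_url: str, pages: int = 3):
    """Fetch several pages of an API and combine them (sketch)"""
    with Sandbox.create(template="code-interpreter") as sandbox:
        sandbox.commands.run("pip install requests --quiet")

        sandbox.run_code(f"""
import requests, json

records = []
for page in range(1, {pages} + 1):
    # Assumption: `_page` is json-server-style pagination; adjust for your API
    response = requests.get('{api_url}', params={{'_page': page}}, timeout=10)
    response.raise_for_status()
    records.extend(response.json())

with open('/workspace/records.json', 'w') as f:
    json.dump(records, f)
""")

        return json.loads(sandbox.files.read("/workspace/records.json"))

# Example (pagination parameters are an assumption about the target API)
posts = extract_paginated("https://jsonplaceholder.typicode.com/posts", pages=2)
print(f"Fetched {len(posts)} records")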

