
Data Analysis with Pandas in Isolated Sandboxes

Tutorials · Amin Al Ali Al Darwish · 10 min read

Pandas is the backbone of Python data analysis. Combined with HopX sandboxes, you get secure, isolated environments where untrusted data can be processed without risk to your systems.

This guide shows how to run data analysis workflows in sandboxes—from basic operations to advanced visualizations.

Why Sandbox Your Data Analysis?

Risks of local data analysis:

  • Malicious files could exploit pandas vulnerabilities
  • User-uploaded data might contain code injection
  • Resource exhaustion from large datasets
  • No isolation between analysis sessions

Sandbox benefits:

  • Complete isolation from your infrastructure
  • Resource limits prevent abuse
  • Each analysis runs fresh
  • No persistent state between runs (see the lifecycle sketch after this list)
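
All four benefits come from the same lifecycle: create a sandbox, run the analysis, destroy the sandbox. Here is a minimal sketch of that pattern, using only the Sandbox.create, commands.run, and kill calls demonstrated throughout this guide (the run_isolated helper is our own naming):

python
from hopx import Sandbox

def run_isolated(command: str) -> str:
    """Run one analysis in a throwaway sandbox; nothing persists afterwards."""
    sandbox = Sandbox.create(template="code-interpreter")
    try:
        result = sandbox.commands.run(command)
        return result.stdout
    finally:
        # Tear the sandbox down so no state leaks into the next run
        sandbox.kill()

# Two calls, two completely separate environments
print(run_isolated("python -c 'import pandas as pd; print(pd.__version__)'"))
print(run_isolated("python -c 'print(42)'"))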

Getting Started

The code-interpreter template comes with pandas, numpy, matplotlib, and other data science packages pre-installed:

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Verify packages are available
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib
print(f'Pandas: {pd.__version__}')
print(f'NumPy: {np.__version__}')
print(f'Matplotlib: {matplotlib.__version__}')
"
""")
print(result.stdout)
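
If an analysis needs a package the template does not include, commands.run executes ordinary shell commands, so installing at runtime should work. A hedged sketch, assuming the image ships pip (not something shown above) and using openpyxl purely as an example package:

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Assumption: pip is available on the template's PATH
result = sandbox.commands.run("pip install openpyxl")
print(result.stdout)

# Confirm the package imports, mirroring the version checks above
result = sandbox.commands.run("python -c 'import openpyxl; print(openpyxl.__version__)'")
print(result.stdout)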

Basic Data Operations

Loading Data

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Upload a CSV file
csv_data = """name,age,city,salary
Alice,30,New York,75000
Bob,25,San Francisco,85000
Carol,35,Chicago,65000
David,28,Boston,70000
Eve,32,Seattle,90000"""

sandbox.files.write("/app/employees.csv", csv_data)

# Load and explore
result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/employees.csv')

print('Shape:', df.shape)
print()
print('Columns:', list(df.columns))
print()
print('Data types:')
print(df.dtypes)
print()
print('First few rows:')
print(df.head())
"
""")
print(result.stdout)

Data Filtering and Selection

python
from hopx import Sandbox
import json

sandbox = Sandbox.create(template="code-interpreter")

# Upload data (csv_data is the employee CSV from the previous example)
sandbox.files.write("/app/data.csv", csv_data)

# Complex filtering
result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/data.csv')

# Filter: age > 28 AND salary > 70000
filtered = df[(df['age'] > 28) & (df['salary'] > 70000)]

# Select specific columns
selected = filtered[['name', 'salary']]

# Convert to JSON for output
print(selected.to_json(orient='records'))
"
""")

data = json.loads(result.stdout)
print("High earners over 28:", data)

Aggregations and Grouping

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Sample sales data
sales_data = """product,category,quantity,price,date
Widget A,Electronics,10,99.99,2024-01-15
Widget B,Electronics,5,149.99,2024-01-16
Gadget X,Home,20,29.99,2024-01-15
Gadget Y,Home,15,39.99,2024-01-17
Widget A,Electronics,8,99.99,2024-01-18"""

sandbox.files.write("/app/sales.csv", sales_data)

result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/sales.csv')

# Calculate revenue
df['revenue'] = df['quantity'] * df['price']

# Group by category
category_stats = df.groupby('category').agg({
    'quantity': 'sum',
    'revenue': 'sum',
    'product': 'nunique'  # Count unique products
}).rename(columns={'product': 'unique_products'})

print('Sales by Category:')
print(category_stats)
print()

# Group by product
product_stats = df.groupby('product').agg({
    'quantity': 'sum',
    'revenue': 'sum'
}).sort_values('revenue', ascending=False)

print('Sales by Product:')
print(product_stats)
"
""")
print(result.stdout)

Data Transformation

Cleaning and Preprocessing

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Messy data with issues
messy_data = """id,name,email,phone,signup_date
1,John Doe,JOHN@EXAMPLE.COM,555-1234,2024/01/15
2,jane smith,jane@test.com,,01-20-2024
3,BOB WILSON,bob@company.org,555-5678,2024-01-22
4,,missing@email.com,555-9999,2024-01-25
5,Alice Brown,alice@domain.com,N/A,"""

sandbox.files.write("/app/messy.csv", messy_data)

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

df = pd.read_csv('/app/messy.csv')

print('Before cleaning:')
print(df)
print()

# Standardize names (title case)
df['name'] = df['name'].str.title()

# Lowercase emails
df['email'] = df['email'].str.lower()

# Replace 'N/A' with NaN
df = df.replace('N/A', np.nan)

# Parse dates; values that don't match the inferred format become NaT
df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')

# Drop rows with missing critical fields
df_clean = df.dropna(subset=['name', 'email'])

print('After cleaning:')
print(df_clean)
print()
print(f'Rows removed: {len(df) - len(df_clean)}')
"
""")
print(result.stdout)

Merging Datasets

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Create two related datasets
customers = """customer_id,name,country
1,Alice,USA
2,Bob,UK
3,Carol,Canada
4,David,USA"""

orders = """order_id,customer_id,product,amount
101,1,Widget,99.99
102,2,Gadget,149.99
103,1,Accessory,29.99
104,3,Widget,99.99
105,5,Unknown,50.00"""  # Customer 5 doesn't exist

sandbox.files.write("/app/customers.csv", customers)
sandbox.files.write("/app/orders.csv", orders)

result = sandbox.commands.run("""
python -c "
import pandas as pd

customers = pd.read_csv('/app/customers.csv')
orders = pd.read_csv('/app/orders.csv')

# Inner join - only matching records
inner = pd.merge(orders, customers, on='customer_id', how='inner')
print('Inner Join (matching only):')
print(inner)
print()

# Left join - all orders, matching customers
left = pd.merge(orders, customers, on='customer_id', how='left')
print('Left Join (all orders):')
print(left)
print()

# Summary by country
by_country = inner.groupby('country')['amount'].agg(['sum', 'count', 'mean'])
print('Sales by Country:')
print(by_country)
"
""")
print(result.stdout)

Statistical Analysis

Descriptive Statistics

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Generate sample data
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

# Generate sample dataset
np.random.seed(42)
n = 1000

df = pd.DataFrame({
    'age': np.random.normal(35, 10, n).astype(int),
    'income': np.random.lognormal(10.5, 0.5, n),
    'score': np.random.beta(2, 5, n) * 100,
    'category': np.random.choice(['A', 'B', 'C'], n)
})

print('Dataset Shape:', df.shape)
print()
print('Descriptive Statistics:')
print(df.describe())
print()
print('By Category:')
print(df.groupby('category').agg({
    'age': 'mean',
    'income': 'median',
    'score': ['mean', 'std']
}))
"
""")
print(result.stdout)

Correlation Analysis

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

np.random.seed(42)
n = 500

# Create correlated variables
x = np.random.normal(0, 1, n)
y = 0.7 * x + 0.3 * np.random.normal(0, 1, n)  # Correlated with x
z = np.random.normal(0, 1, n)  # Independent

df = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Correlation matrix
print('Correlation Matrix:')
print(df.corr())
print()

# Spearman correlation (rank-based)
print('Spearman Correlation:')
print(df.corr(method='spearman'))
"
""")
print(result.stdout)

Data Visualization

Basic Charts

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Create visualization
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sample data
categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Home']
sales = [45000, 32000, 28000, 15000, 22000]

# Create a figure with two panels
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
axes[0].bar(categories, sales, color='steelblue')
axes[0].set_title('Sales by Category')
axes[0].set_ylabel('Sales ($)')
axes[0].tick_params(axis='x', rotation=45)

# Pie chart
axes[1].pie(sales, labels=categories, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Sales Distribution')

plt.tight_layout()
plt.savefig('/app/sales_charts.png', dpi=150)
print('Charts saved to /app/sales_charts.png')
"
""")

# Download the chart
chart_data = sandbox.files.read("/app/sales_charts.png")
with open("sales_charts.png", "wb") as f:
    f.write(chart_data)
print("Chart downloaded!")

Time Series Visualization

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generate time series data
np.random.seed(42)
dates = pd.date_range('2024-01-01', periods=365, freq='D')
base = 100 + np.cumsum(np.random.randn(365) * 2)
seasonal = 10 * np.sin(np.arange(365) * 2 * np.pi / 365)
values = base + seasonal

df = pd.DataFrame({'date': dates, 'value': values})
df.set_index('date', inplace=True)

# Calculate moving averages
df['MA_7'] = df['value'].rolling(window=7).mean()
df['MA_30'] = df['value'].rolling(window=30).mean()

# Plot
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df.index, df['value'], alpha=0.5, label='Daily', linewidth=0.5)
ax.plot(df.index, df['MA_7'], label='7-day MA', linewidth=1.5)
ax.plot(df.index, df['MA_30'], label='30-day MA', linewidth=2)

ax.set_title('Time Series with Moving Averages')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/app/timeseries.png', dpi=150)
print('Time series chart saved')
"
""")

# Download
ts_chart = sandbox.files.read("/app/timeseries.png")
with open("timeseries.png", "wb") as f:
    f.write(ts_chart)

Advanced Visualizations

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generate sample data
np.random.seed(42)
n = 200

df = pd.DataFrame({
    'x': np.random.normal(0, 1, n),
    'y': np.random.normal(0, 1, n),
    'size': np.random.uniform(50, 500, n),
    'category': np.random.choice(['A', 'B', 'C'], n)
})
df['y'] = df['y'] + 0.5 * df['x']  # Add correlation

# Create figure with multiple plots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot
scatter = axes[0, 0].scatter(df['x'], df['y'], c=df['category'].map({'A': 0, 'B': 1, 'C': 2}),
                             s=df['size']/5, alpha=0.6, cmap='viridis')
axes[0, 0].set_title('Scatter Plot with Size & Color')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')

# Overlaid histograms by category
for cat in ['A', 'B', 'C']:
    subset = df[df['category'] == cat]['x']
    axes[0, 1].hist(subset, bins=20, alpha=0.5, label=cat, density=True)
axes[0, 1].set_title('Distribution by Category')
axes[0, 1].legend()

# Box plot
df.boxplot(column='y', by='category', ax=axes[1, 0])
axes[1, 0].set_title('Box Plot by Category')

# Heatmap (correlation)
corr = df[['x', 'y', 'size']].corr()
im = axes[1, 1].imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
axes[1, 1].set_xticks(range(len(corr.columns)))
axes[1, 1].set_yticks(range(len(corr.columns)))
axes[1, 1].set_xticklabels(corr.columns)
axes[1, 1].set_yticklabels(corr.columns)
axes[1, 1].set_title('Correlation Heatmap')
plt.colorbar(im, ax=axes[1, 1])

plt.suptitle('Data Analysis Dashboard', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('/app/dashboard.png', dpi=150, bbox_inches='tight')
print('Dashboard saved')
"
""")

dashboard = sandbox.files.read("/app/dashboard.png")
with open("dashboard.png", "wb") as f:
    f.write(dashboard)

Processing User-Uploaded Data

Here's a complete pattern for safely processing user uploads:

python
from hopx import Sandbox
import json

def analyze_user_csv(csv_content: bytes, analysis_request: str) -> dict:
    """
    Safely analyze user-uploaded CSV data.

    Args:
        csv_content: Raw CSV file content
        analysis_request: Natural language description of desired analysis

    Returns:
        Dictionary with analysis results and any generated charts
    """
    sandbox = Sandbox.create(template="code-interpreter")

    try:
        # Upload the user's data
        sandbox.files.write("/app/user_data.csv", csv_content)

        # First, validate the CSV
        validation = sandbox.commands.run("""
python -c "
import pandas as pd
import json

try:
    df = pd.read_csv('/app/user_data.csv')
    info = {
        'valid': True,
        'rows': len(df),
        'columns': list(df.columns),
        'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()}
    }
except Exception as e:
    info = {'valid': False, 'error': str(e)}

print(json.dumps(info))
"
""")

        data_info = json.loads(validation.stdout)

        if not data_info['valid']:
            return {'error': data_info['error']}

        # Perform the requested analysis
        analysis_code = generate_analysis_code(analysis_request, data_info)
        sandbox.files.write("/app/analyze.py", analysis_code)

        result = sandbox.commands.run("cd /app && python analyze.py")

        # Collect results
        output = {'data_info': data_info}

        if result.exit_code == 0:
            output['analysis'] = result.stdout
        else:
            output['error'] = result.stderr

        # Check for generated charts
        chart_check = sandbox.commands.run("ls /app/*.png 2>/dev/null || true")
        if chart_check.stdout.strip():
            charts = []
            for chart_path in chart_check.stdout.strip().split('\n'):
                chart_data = sandbox.files.read(chart_path)
                charts.append({
                    'name': chart_path.split('/')[-1],
                    'data': chart_data
                })
            output['charts'] = charts

        return output

    finally:
        sandbox.kill()


def generate_analysis_code(request: str, data_info: dict) -> str:
    """Generate pandas analysis code based on user request"""
    # This would typically use an LLM to generate code
    # For this example, we'll use a template

    return f'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('/app/user_data.csv')

# Basic analysis
print("=== Data Overview ===")
print(f"Rows: {{len(df)}}")
print(f"Columns: {{len(df.columns)}}")
print()

print("=== Summary Statistics ===")
print(df.describe())
print()

print("=== Missing Values ===")
print(df.isnull().sum())
print()

# Generate a basic chart for numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns[:4]  # Limit to 4
if len(numeric_cols) > 0:
    fig, axes = plt.subplots(1, len(numeric_cols), figsize=(4*len(numeric_cols), 4))
    if len(numeric_cols) == 1:
        axes = [axes]

    for ax, col in zip(axes, numeric_cols):
        df[col].hist(ax=ax, bins=20)
        ax.set_title(col)

    plt.tight_layout()
    plt.savefig('/app/histograms.png', dpi=100)
    print("\\nHistograms saved to /app/histograms.png")
'''


# Usage example
with open("user_upload.csv", "rb") as f:
    csv_content = f.read()

results = analyze_user_csv(csv_content, "Show me basic statistics and trends")
print(results['analysis'])

Performance Tips

1. Use Appropriate Data Types

python
# Convert types to reduce memory
sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/large_data.csv')

# Before
print('Memory before:', df.memory_usage(deep=True).sum() / 1e6, 'MB')

# Optimize types
df['category'] = df['category'].astype('category')
df['small_int'] = df['small_int'].astype('int16')
df['boolean'] = df['boolean'].astype('bool')

# After
print('Memory after:', df.memory_usage(deep=True).sum() / 1e6, 'MB')
"
""")

2. Process Large Files in Chunks

python
sandbox.commands.run("""
python -c "
import pandas as pd

# Process large CSV in chunks
chunk_size = 10000
results = []

for chunk in pd.read_csv('/app/huge_file.csv', chunksize=chunk_size):
    # Process each chunk
    chunk_result = chunk.groupby('category')['value'].sum()
    results.append(chunk_result)

# Combine results
final = pd.concat(results).groupby(level=0).sum()
print(final)
"
""")

3. Use Efficient File Formats

python
sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/data.csv')

# Save as Parquet (much faster to read)
df.to_parquet('/app/data.parquet')

# Read Parquet (faster than CSV)
df2 = pd.read_parquet('/app/data.parquet')
"
""")

Conclusion

Running Pandas in HopX sandboxes gives you:

  • Security - User data is isolated
  • Reliability - Consistent environment every time
  • Scalability - Process many datasets in parallel (see the sketch below)
  • Safety - Malicious data can't escape
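
Because every sandbox is independent, the parallelism above is plain host-side fan-out. A sketch using the analyze_user_csv helper from the upload section, a standard-library thread pool, and illustrative file names:

python
from concurrent.futures import ThreadPoolExecutor

# Illustrative inputs; analyze_user_csv is the helper defined earlier
files = ["q1_sales.csv", "q2_sales.csv", "q3_sales.csv"]

def analyze_file(path: str) -> dict:
    with open(path, "rb") as f:
        return analyze_user_csv(f.read(), "Show me basic statistics and trends")

# Each worker creates, uses, and kills its own isolated sandbox
with ThreadPoolExecutor(max_workers=3) as pool:
    results = list(pool.map(analyze_file, files))

for path, res in zip(files, results):
    print(path, "->", "error" if "error" in res else "ok")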

Whether you're building a data analysis API, processing user uploads, or running automated reports, sandboxed Pandas provides the foundation for secure data science.
