
Data Analysis with Pandas in Isolated Sandboxes

Tutorials · Amin Al Ali Al Darwish · 10 min read

Pandas is the backbone of Python data analysis. Combined with HopX sandboxes, you get secure, isolated environments where untrusted data can be processed without risk to your systems.

This guide shows how to run data analysis workflows in sandboxes—from basic operations to advanced visualizations.

Why Sandbox Your Data Analysis?

Risks of local data analysis:

  • Malicious files could exploit pandas vulnerabilities
  • User-uploaded data might contain code injection
  • Resource exhaustion from large datasets
  • No isolation between analysis sessions

Sandbox benefits:

  • Complete isolation from your infrastructure
  • Resource limits prevent abuse
  • Each analysis runs fresh
  • No persistent state between runs (see the lifecycle sketch after this list)
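
All four benefits come from the same lifecycle: create a sandbox, run the analysis, destroy the sandbox. Here is a minimal sketch of that pattern, using only the Sandbox.create, commands.run, and kill calls demonstrated throughout this guide (the run_isolated helper is our own naming):

python
from hopx import Sandbox

def run_isolated(command: str) -> str:
    """Run one analysis in a throwaway sandbox; nothing persists afterwards."""
    sandbox = Sandbox.create(template="code-interpreter")
    try:
        result = sandbox.commands.run(command)
        return result.stdout
    finally:
        # Tear the sandbox down so no state leaks into the next run
        sandbox.kill()

# Two calls, two completely separate environments
print(run_isolated("python -c 'import pandas as pd; print(pd.__version__)'"))
print(run_isolated("python -c 'print(42)'"))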

Getting Started

The code-interpreter template comes with pandas, numpy, matplotlib, and other data science packages pre-installed:

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Verify packages are available
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib
print(f'Pandas: {pd.__version__}')
print(f'NumPy: {np.__version__}')
print(f'Matplotlib: {matplotlib.__version__}')
"
""")
print(result.stdout)
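
If an analysis needs a package the template does not include, commands.run executes ordinary shell commands, so installing at runtime should work. A hedged sketch, assuming the image ships pip (not something shown above) and using openpyxl purely as an example package:

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Assumption: pip is available on the template's PATH
result = sandbox.commands.run("pip install openpyxl")
print(result.stdout)

# Confirm the package imports, mirroring the version checks above
result = sandbox.commands.run("python -c 'import openpyxl; print(openpyxl.__version__)'")
print(result.stdout)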

Basic Data Operations

Loading Data

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Upload a CSV file
csv_data = """name,age,city,salary
Alice,30,New York,75000
Bob,25,San Francisco,85000
Carol,35,Chicago,65000
David,28,Boston,70000
Eve,32,Seattle,90000"""

sandbox.files.write("/app/employees.csv", csv_data)

# Load and explore
result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/employees.csv')

print('Shape:', df.shape)
print()
print('Columns:', list(df.columns))
print()
print('Data types:')
print(df.dtypes)
print()
print('First few rows:')
print(df.head())
"
""")
print(result.stdout)

Data Filtering and Selection

python
from hopx import Sandbox
import json

sandbox = Sandbox.create(template="code-interpreter")

# Upload data (csv_data is the employee CSV from the previous example)
sandbox.files.write("/app/data.csv", csv_data)

# Complex filtering
result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/data.csv')

# Filter: age > 28 AND salary > 70000
filtered = df[(df['age'] > 28) & (df['salary'] > 70000)]

# Select specific columns
selected = filtered[['name', 'salary']]

# Convert to JSON for output
print(selected.to_json(orient='records'))
"
""")

data = json.loads(result.stdout)
print("High earners over 28:", data)

Aggregations and Grouping

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Sample sales data
sales_data = """product,category,quantity,price,date
Widget A,Electronics,10,99.99,2024-01-15
Widget B,Electronics,5,149.99,2024-01-16
Gadget X,Home,20,29.99,2024-01-15
Gadget Y,Home,15,39.99,2024-01-17
Widget A,Electronics,8,99.99,2024-01-18"""

sandbox.files.write("/app/sales.csv", sales_data)

result = sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/sales.csv')

# Calculate revenue
df['revenue'] = df['quantity'] * df['price']

# Group by category
category_stats = df.groupby('category').agg({
    'quantity': 'sum',
    'revenue': 'sum',
    'product': 'nunique'  # Count unique products
}).rename(columns={'product': 'unique_products'})

print('Sales by Category:')
print(category_stats)
print()

# Group by product
product_stats = df.groupby('product').agg({
    'quantity': 'sum',
    'revenue': 'sum'
}).sort_values('revenue', ascending=False)

print('Sales by Product:')
print(product_stats)
"
""")
print(result.stdout)

Data Transformation

Cleaning and Preprocessing

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Messy data with issues
messy_data = """id,name,email,phone,signup_date
1,John Doe,JOHN@EXAMPLE.COM,555-1234,2024/01/15
2,jane smith,jane@test.com,,01-20-2024
3,BOB WILSON,bob@company.org,555-5678,2024-01-22
4,,missing@email.com,555-9999,2024-01-25
5,Alice Brown,alice@domain.com,N/A,"""

sandbox.files.write("/app/messy.csv", messy_data)

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

df = pd.read_csv('/app/messy.csv')

print('Before cleaning:')
print(df)
print()

# Standardize names (title case)
df['name'] = df['name'].str.title()

# Lowercase emails
df['email'] = df['email'].str.lower()

# Replace 'N/A' with NaN
df = df.replace('N/A', np.nan)

# Parse dates; values that don't match the inferred format become NaT
df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')

# Drop rows with missing critical fields
df_clean = df.dropna(subset=['name', 'email'])

print('After cleaning:')
print(df_clean)
print()
print(f'Rows removed: {len(df) - len(df_clean)}')
"
""")
print(result.stdout)

Merging Datasets

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Create two related datasets
customers = """customer_id,name,country
1,Alice,USA
2,Bob,UK
3,Carol,Canada
4,David,USA"""

orders = """order_id,customer_id,product,amount
101,1,Widget,99.99
102,2,Gadget,149.99
103,1,Accessory,29.99
104,3,Widget,99.99
105,5,Unknown,50.00"""  # Customer 5 doesn't exist

sandbox.files.write("/app/customers.csv", customers)
sandbox.files.write("/app/orders.csv", orders)

result = sandbox.commands.run("""
python -c "
import pandas as pd

customers = pd.read_csv('/app/customers.csv')
orders = pd.read_csv('/app/orders.csv')

# Inner join - only matching records
inner = pd.merge(orders, customers, on='customer_id', how='inner')
print('Inner Join (matching only):')
print(inner)
print()

# Left join - all orders, matching customers
left = pd.merge(orders, customers, on='customer_id', how='left')
print('Left Join (all orders):')
print(left)
print()

# Summary by country
by_country = inner.groupby('country')['amount'].agg(['sum', 'count', 'mean'])
print('Sales by Country:')
print(by_country)
"
""")
print(result.stdout)

Statistical Analysis

Descriptive Statistics

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Generate sample data
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

# Generate sample dataset
np.random.seed(42)
n = 1000

df = pd.DataFrame({
    'age': np.random.normal(35, 10, n).astype(int),
    'income': np.random.lognormal(10.5, 0.5, n),
    'score': np.random.beta(2, 5, n) * 100,
    'category': np.random.choice(['A', 'B', 'C'], n)
})

print('Dataset Shape:', df.shape)
print()
print('Descriptive Statistics:')
print(df.describe())
print()
print('By Category:')
print(df.groupby('category').agg({
    'age': 'mean',
    'income': 'median',
    'score': ['mean', 'std']
}))
"
""")
print(result.stdout)

Correlation Analysis

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np

np.random.seed(42)
n = 500

# Create correlated variables
x = np.random.normal(0, 1, n)
y = 0.7 * x + 0.3 * np.random.normal(0, 1, n)  # Correlated with x
z = np.random.normal(0, 1, n)  # Independent

df = pd.DataFrame({'x': x, 'y': y, 'z': z})

# Correlation matrix
print('Correlation Matrix:')
print(df.corr())
print()

# Spearman correlation (rank-based)
print('Spearman Correlation:')
print(df.corr(method='spearman'))
"
""")
print(result.stdout)

Data Visualization

Basic Charts

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

# Create visualization
result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sample data
categories = ['Electronics', 'Clothing', 'Food', 'Books', 'Home']
sales = [45000, 32000, 28000, 15000, 22000]

# Create a figure with two panels
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
axes[0].bar(categories, sales, color='steelblue')
axes[0].set_title('Sales by Category')
axes[0].set_ylabel('Sales ($)')
axes[0].tick_params(axis='x', rotation=45)

# Pie chart
axes[1].pie(sales, labels=categories, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Sales Distribution')

plt.tight_layout()
plt.savefig('/app/sales_charts.png', dpi=150)
print('Charts saved to /app/sales_charts.png')
"
""")

# Download the chart
chart_data = sandbox.files.read("/app/sales_charts.png")
with open("sales_charts.png", "wb") as f:
    f.write(chart_data)
print("Chart downloaded!")

Time Series Visualization

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generate time series data
np.random.seed(42)
dates = pd.date_range('2024-01-01', periods=365, freq='D')
base = 100 + np.cumsum(np.random.randn(365) * 2)
seasonal = 10 * np.sin(np.arange(365) * 2 * np.pi / 365)
values = base + seasonal

df = pd.DataFrame({'date': dates, 'value': values})
df.set_index('date', inplace=True)

# Calculate moving averages
df['MA_7'] = df['value'].rolling(window=7).mean()
df['MA_30'] = df['value'].rolling(window=30).mean()

# Plot
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df.index, df['value'], alpha=0.5, label='Daily', linewidth=0.5)
ax.plot(df.index, df['MA_7'], label='7-day MA', linewidth=1.5)
ax.plot(df.index, df['MA_30'], label='30-day MA', linewidth=2)

ax.set_title('Time Series with Moving Averages')
ax.set_xlabel('Date')
ax.set_ylabel('Value')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('/app/timeseries.png', dpi=150)
print('Time series chart saved')
"
""")

# Download
ts_chart = sandbox.files.read("/app/timeseries.png")
with open("timeseries.png", "wb") as f:
    f.write(ts_chart)

Advanced Visualizations

python
from hopx import Sandbox

sandbox = Sandbox.create(template="code-interpreter")

result = sandbox.commands.run("""
python -c "
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Generate sample data
np.random.seed(42)
n = 200

df = pd.DataFrame({
    'x': np.random.normal(0, 1, n),
    'y': np.random.normal(0, 1, n),
    'size': np.random.uniform(50, 500, n),
    'category': np.random.choice(['A', 'B', 'C'], n)
})
df['y'] = df['y'] + 0.5 * df['x']  # Add correlation

# Create figure with multiple plots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Scatter plot
scatter = axes[0, 0].scatter(df['x'], df['y'], c=df['category'].map({'A': 0, 'B': 1, 'C': 2}),
                             s=df['size']/5, alpha=0.6, cmap='viridis')
axes[0, 0].set_title('Scatter Plot with Size & Color')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')

# Overlaid histograms by category
for cat in ['A', 'B', 'C']:
    subset = df[df['category'] == cat]['x']
    axes[0, 1].hist(subset, bins=20, alpha=0.5, label=cat, density=True)
axes[0, 1].set_title('Distribution by Category')
axes[0, 1].legend()

# Box plot
df.boxplot(column='y', by='category', ax=axes[1, 0])
axes[1, 0].set_title('Box Plot by Category')

# Heatmap (correlation)
corr = df[['x', 'y', 'size']].corr()
im = axes[1, 1].imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
axes[1, 1].set_xticks(range(len(corr.columns)))
axes[1, 1].set_yticks(range(len(corr.columns)))
axes[1, 1].set_xticklabels(corr.columns)
axes[1, 1].set_yticklabels(corr.columns)
axes[1, 1].set_title('Correlation Heatmap')
plt.colorbar(im, ax=axes[1, 1])

plt.suptitle('Data Analysis Dashboard', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('/app/dashboard.png', dpi=150, bbox_inches='tight')
print('Dashboard saved')
"
""")

dashboard = sandbox.files.read("/app/dashboard.png")
with open("dashboard.png", "wb") as f:
    f.write(dashboard)

Processing User-Uploaded Data

Here's a complete pattern for safely processing user uploads:

python
from hopx import Sandbox
import json

def analyze_user_csv(csv_content: bytes, analysis_request: str) -> dict:
    """
    Safely analyze user-uploaded CSV data.

    Args:
        csv_content: Raw CSV file content
        analysis_request: Natural language description of desired analysis

    Returns:
        Dictionary with analysis results and any generated charts
    """
    sandbox = Sandbox.create(template="code-interpreter")

    try:
        # Upload the user's data
        sandbox.files.write("/app/user_data.csv", csv_content)

        # First, validate the CSV
        validation = sandbox.commands.run("""
python -c "
import pandas as pd
import json

try:
    df = pd.read_csv('/app/user_data.csv')
    info = {
        'valid': True,
        'rows': len(df),
        'columns': list(df.columns),
        'dtypes': {col: str(dtype) for col, dtype in df.dtypes.items()}
    }
except Exception as e:
    info = {'valid': False, 'error': str(e)}

print(json.dumps(info))
"
""")

        data_info = json.loads(validation.stdout)

        if not data_info['valid']:
            return {'error': data_info['error']}

        # Perform the requested analysis
        analysis_code = generate_analysis_code(analysis_request, data_info)
        sandbox.files.write("/app/analyze.py", analysis_code)

        result = sandbox.commands.run("cd /app && python analyze.py")

        # Collect results
        output = {'data_info': data_info}

        if result.exit_code == 0:
            output['analysis'] = result.stdout
        else:
            output['error'] = result.stderr

        # Check for generated charts
        chart_check = sandbox.commands.run("ls /app/*.png 2>/dev/null || true")
        if chart_check.stdout.strip():
            charts = []
            for chart_path in chart_check.stdout.strip().split('\n'):
                chart_data = sandbox.files.read(chart_path)
                charts.append({
                    'name': chart_path.split('/')[-1],
                    'data': chart_data
                })
            output['charts'] = charts

        return output

    finally:
        sandbox.kill()


def generate_analysis_code(request: str, data_info: dict) -> str:
    """Generate pandas analysis code based on user request"""
    # This would typically use an LLM to generate code
    # For this example, we'll use a template

    return f'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('/app/user_data.csv')

# Basic analysis
print("=== Data Overview ===")
print(f"Rows: {{len(df)}}")
print(f"Columns: {{len(df.columns)}}")
print()

print("=== Summary Statistics ===")
print(df.describe())
print()

print("=== Missing Values ===")
print(df.isnull().sum())
print()

# Generate a basic chart for numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns[:4]  # Limit to 4
if len(numeric_cols) > 0:
    fig, axes = plt.subplots(1, len(numeric_cols), figsize=(4*len(numeric_cols), 4))
    if len(numeric_cols) == 1:
        axes = [axes]

    for ax, col in zip(axes, numeric_cols):
        df[col].hist(ax=ax, bins=20)
        ax.set_title(col)

    plt.tight_layout()
    plt.savefig('/app/histograms.png', dpi=100)
    print("\\nHistograms saved to /app/histograms.png")
'''


# Usage example
with open("user_upload.csv", "rb") as f:
    csv_content = f.read()

results = analyze_user_csv(csv_content, "Show me basic statistics and trends")
print(results['analysis'])

Performance Tips

1. Use Appropriate Data Types

python
# Convert types to reduce memory
sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/large_data.csv')

# Before
print('Memory before:', df.memory_usage(deep=True).sum() / 1e6, 'MB')

# Optimize types
df['category'] = df['category'].astype('category')
df['small_int'] = df['small_int'].astype('int16')
df['boolean'] = df['boolean'].astype('bool')

# After
print('Memory after:', df.memory_usage(deep=True).sum() / 1e6, 'MB')
"
""")

2. Process Large Files in Chunks

python
sandbox.commands.run("""
python -c "
import pandas as pd

# Process large CSV in chunks
chunk_size = 10000
results = []

for chunk in pd.read_csv('/app/huge_file.csv', chunksize=chunk_size):
    # Process each chunk
    chunk_result = chunk.groupby('category')['value'].sum()
    results.append(chunk_result)

# Combine results
final = pd.concat(results).groupby(level=0).sum()
print(final)
"
""")

3. Use Efficient File Formats

python
sandbox.commands.run("""
python -c "
import pandas as pd

df = pd.read_csv('/app/data.csv')

# Save as Parquet (much faster to read)
df.to_parquet('/app/data.parquet')

# Read Parquet (faster than CSV)
df2 = pd.read_parquet('/app/data.parquet')
"
""")

Conclusion

Running Pandas in HopX sandboxes gives you:

  • Security - User data is isolated
  • Reliability - Consistent environment every time
  • Scalability - Process many datasets in parallel (see the sketch below)
  • Safety - Malicious data can't escape
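
Because every sandbox is independent, the parallelism above is plain host-side fan-out. A sketch using the analyze_user_csv helper from the upload section, a standard-library thread pool, and illustrative file names:

python
from concurrent.futures import ThreadPoolExecutor

# Illustrative inputs; analyze_user_csv is the helper defined earlier
files = ["q1_sales.csv", "q2_sales.csv", "q3_sales.csv"]

def analyze_file(path: str) -> dict:
    with open(path, "rb") as f:
        return analyze_user_csv(f.read(), "Show me basic statistics and trends")

# Each worker creates, uses, and kills its own isolated sandbox
with ThreadPoolExecutor(max_workers=3) as pool:
    results = list(pool.map(analyze_file, files))

for path, res in zip(files, results):
    print(path, "->", "error" if "error" in res else "ok")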

Whether you're building a data analysis API, processing user uploads, or running automated reports, sandboxed Pandas provides the foundation for secure data science.
