SLURM Cluster Tutorial

This tutorial demonstrates how to use Clustrix with SLURM (Simple Linux Utility for Resource Management) clusters, one of the most common cluster schedulers in high-performance computing.

Prerequisites

  1. Access to a SLURM cluster

  2. SSH key setup (see SSH Key Setup for Remote Clusters)

  3. Clustrix installed with: pip install clustrix

Configuration Options

Option 1: Interactive Widget (Recommended for Jupyter)

For Jupyter notebook users, use the interactive configuration widget:

import clustrix  # Auto-loads the magic command

# Use the magic command to open the configuration widget
%%clusterfy
# Interactive widget appears with SLURM templates and GUI configuration

The widget includes pre-built SLURM templates and allows you to save configurations for reuse.

Option 2: Programmatic Configuration

Configure Clustrix programmatically for your SLURM cluster:

from clustrix import configure

configure(
    cluster_type="slurm",
    cluster_host="slurm.university.edu",
    username="your_username",
    key_file="~/.ssh/slurm_key",  # Optional if using SSH agent
    remote_work_dir="/scratch/your_username/clustrix"
)

Simple Job Execution

Execute a basic function on the SLURM cluster:

from clustrix import cluster

@cluster(cores=4, memory="8GB", time="01:00:00")
def compute_pi(n_samples):
    """Monte Carlo estimation of pi."""
    import random
    inside_circle = 0

    for _ in range(n_samples):
        x, y = random.random(), random.random()
        if x*x + y*y <= 1:
            inside_circle += 1

    return 4 * inside_circle / n_samples

# This will submit a job to SLURM and wait for results
pi_estimate = compute_pi(1000000)
print(f"Pi estimate: {pi_estimate}")

Resource Specification

SLURM-specific resource options:

@cluster(
    cores=16,              # Number of CPU cores
    memory="32GB",         # Memory requirement
    time="04:00:00",       # Wall time (HH:MM:SS)
    partition="compute",   # SLURM partition
    nodes=1,              # Number of nodes
    ntasks_per_node=16,   # Tasks per node
    account="research123"  # SLURM account
)
def intensive_computation():
    import numpy as np
    # Simulate intensive computation
    matrix = np.random.rand(10000, 10000)
    eigenvalues = np.linalg.eigvals(matrix)
    return len(eigenvalues)

Advanced Configuration

Environment and Module Setup

Configure environment modules and variables:

configure(
    cluster_type="slurm",
    cluster_host="slurm.hpc.edu",
    username="researcher",

    # Load required modules
    module_loads=[
        "python/3.11",
        "gcc/11.2",
        "openmpi/4.1"
    ],

    # Set environment variables
    environment_variables={
        "OMP_NUM_THREADS": "16",
        "PYTHONPATH": "/home/researcher/libs:$PYTHONPATH"
    },

    # Default resources
    default_cores=8,
    default_memory="16GB",
    default_time="02:00:00",
    default_partition="standard"
)

Configuration File

Create ~/.clustrix/config.yml:

cluster_type: "slurm"
cluster_host: "slurm.university.edu"
username: "researcher"
key_file: "~/.ssh/slurm_key"
remote_work_dir: "/scratch/researcher/clustrix"

# SLURM-specific settings
default_partition: "compute"
default_account: "research_group"

# Resource defaults
default_cores: 8
default_memory: "16GB"
default_time: "02:00:00"

# Environment setup
module_loads:
  - "python/3.11"
  - "gcc/11.2"
  - "intel-mpi/2021"

environment_variables:
  OMP_NUM_THREADS: "8"
  MKL_NUM_THREADS: "8"

Parallel Processing Examples

Array Jobs with Loop Parallelization

Process multiple datasets in parallel:

@cluster(cores=4, memory="8GB", parallel=True)
def process_dataset(dataset_id, analysis_type="standard"):
    """Process a single dataset."""
    import numpy as np
    import time

    # Simulate data loading
    print(f"Processing dataset {dataset_id} with {analysis_type}")
    data = np.random.rand(1000, 100)

    # Simulate analysis
    if analysis_type == "intensive":
        time.sleep(2)  # Simulate longer computation
        result = np.mean(data**3)
    else:
        result = np.mean(data**2)

    return {
        "dataset_id": dataset_id,
        "result": result,
        "analysis_type": analysis_type
    }

# Process multiple datasets
dataset_ids = range(10)
results = []

for dataset_id in dataset_ids:
    result = process_dataset(dataset_id, analysis_type="intensive")
    results.append(result)

print(f"Processed {len(results)} datasets")

Machine Learning Workflow

Distributed hyperparameter tuning:

@cluster(cores=8, memory="16GB", time="03:00:00")
def train_model(params):
    """Train ML model with given hyperparameters."""
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.datasets import make_regression
    from sklearn.model_selection import cross_val_score
    import numpy as np

    # Generate synthetic dataset
    X, y = make_regression(
        n_samples=10000,
        n_features=20,
        noise=0.1,
        random_state=42
    )

    # Create model with parameters
    model = RandomForestRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        random_state=42
    )

    # Cross-validation
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')

    return {
        'params': params,
        'mean_score': np.mean(scores),
        'std_score': np.std(scores)
    }

# Hyperparameter grid
param_grid = [
    {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2},
    {'n_estimators': 200, 'max_depth': 15, 'min_samples_split': 5},
    {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 10},
    {'n_estimators': 150, 'max_depth': 12, 'min_samples_split': 3},
]

# Train models in parallel
results = []
for params in param_grid:
    result = train_model(params)
    results.append(result)

# Find best parameters
best_result = max(results, key=lambda x: x['mean_score'])
print(f"Best parameters: {best_result['params']}")
print(f"Best score: {best_result['mean_score']:.4f}")

Job Management

Job Status Monitoring

from clustrix.executor import ClusterExecutor
from clustrix.config import get_config

# Get current configuration
config = get_config()
executor = ClusterExecutor(config)

# Submit job and get job ID
@cluster(cores=4, memory="8GB")
def long_running_task():
    import time
    time.sleep(300)  # 5 minutes
    return "Task completed"

# For actual job monitoring, you would need to modify
# the executor to return job IDs
result = long_running_task()

Error Handling

@cluster(cores=2, memory="4GB")
def error_prone_function(divide_by_zero=False):
    """Function that may raise errors."""
    import numpy as np

    if divide_by_zero:
        return 1 / 0  # This will raise ZeroDivisionError

    # Normal computation
    data = np.random.rand(1000)
    return np.mean(data)

try:
    # This will work
    result = error_prone_function(divide_by_zero=False)
    print(f"Success: {result}")

    # This will raise an error
    result = error_prone_function(divide_by_zero=True)
except ZeroDivisionError as e:
    print(f"Caught error from remote execution: {e}")

Best Practices

Resource Estimation

# Estimate resources based on problem size
def estimate_resources(data_size_gb):
    """Estimate resources needed for computation."""

    # Rule of thumb: 2GB RAM per GB of data
    memory_gb = max(4, int(data_size_gb * 2))

    # More cores for larger datasets (up to 16)
    cores = min(16, max(2, int(data_size_gb / 2)))

    # Time based on data size (minimum 30 minutes)
    hours = max(0.5, data_size_gb * 0.1)
    time_str = f"{int(hours):02d}:{int((hours % 1) * 60):02d}:00"

    return {
        'cores': cores,
        'memory': f"{memory_gb}GB",
        'time': time_str
    }

# Use estimated resources
data_size = 10  # GB
resources = estimate_resources(data_size)

@cluster(**resources)
def process_large_dataset():
    # Process your large dataset
    pass

Debugging and Troubleshooting

Enable Debug Logging

import logging
logging.basicConfig(level=logging.DEBUG)

from clustrix import configure, cluster

# This will show detailed SSH and job submission logs
configure(cluster_type="slurm", cluster_host="your-cluster")

Common Issues

Job Fails with “Permission Denied”

Check SSH setup and file permissions:

# Test SSH connection
ssh your-cluster "squeue --version"

# Check permissions
ssh your-cluster "ls -la ~/.ssh/"

Jobs Stuck in Queue

Check SLURM queue status:

# Check your jobs
squeue -u $USER

# Check partition availability
sinfo -p your_partition

Out of Memory Errors

Increase memory allocation:

@cluster(cores=4, memory="32GB")  # Increase from default
def memory_intensive_task():
    import numpy as np
    # Large arrays need more memory
    big_array = np.random.rand(50000, 50000)
    return np.sum(big_array)

Complete Example

Here’s a complete scientific computing example:

from clustrix import configure, cluster
import numpy as np

# Configure SLURM cluster
configure(
    cluster_type="slurm",
    cluster_host="slurm.university.edu",
    username="researcher",
    remote_work_dir="/scratch/researcher/clustrix",

    # Environment setup
    module_loads=["python/3.11", "intel-mkl/2021"],
    environment_variables={"MKL_NUM_THREADS": "8"},

    # Default resources
    default_cores=8,
    default_memory="16GB",
    default_time="02:00:00"
)

@cluster(cores=16, memory="32GB", time="04:00:00")
def monte_carlo_simulation(n_trials, n_steps):
    """Monte Carlo simulation of random walk."""
    import numpy as np

    results = []
    for trial in range(n_trials):
        # Random walk
        steps = np.random.choice([-1, 1], size=n_steps)
        positions = np.cumsum(steps)

        # Calculate statistics
        max_displacement = np.max(np.abs(positions))
        final_position = positions[-1]

        results.append({
            'trial': trial,
            'max_displacement': max_displacement,
            'final_position': final_position
        })

    return results

# Run simulation
print("Starting Monte Carlo simulation on SLURM cluster...")
results = monte_carlo_simulation(n_trials=1000, n_steps=10000)

# Analyze results
final_positions = [r['final_position'] for r in results]
max_displacements = [r['max_displacement'] for r in results]

print(f"Mean final position: {np.mean(final_positions):.2f}")
print(f"Mean max displacement: {np.mean(max_displacements):.2f}")
print(f"Std final position: {np.std(final_positions):.2f}")

This tutorial covers the essential aspects of using Clustrix with SLURM clusters. For more advanced topics, see the API documentation and other tutorials.