Why Use Batch API?

Higher Throughput

Process millions of requests without rate limits

Async Processing

Submit jobs and retrieve results when ready

Bulk Operations

Process entire datasets in one go

Completion Notifications

Webhook and/or email notifications when the batch completes

Quick Start

1. Prepare Input File

Create a JSONL file with your requests:
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "messages": [{"role": "user", "content": "What is AI?"}]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", "messages": [{"role": "user", "content": "Explain ML"}]}}

2. Upload File

import os
import requests

headers = {
  "Authorization": f"Bearer {os.environ['CHEAPESTINFERENCE_API_KEY']}"
}

# Upload input file
with open("batch_requests.jsonl", "rb") as f:
    files = {"file": ("batch_requests.jsonl", f, "application/jsonl")}
    data = {"purpose": "batch"}
    resp = requests.post(
        "https://api.cheapestinference.ai/v1/files",
        headers=headers,
        files=files,
        data=data,
    )
    resp.raise_for_status()
    batch_file = resp.json()

3. Create Batch Job

# Create batch
resp = requests.post(
    "https://api.cheapestinference.ai/v1/batches",
    headers={**headers, "Content-Type": "application/json"},
    json={
        "input_file_id": batch_file["id"],
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
    },
)
resp.raise_for_status()
batch = resp.json()
print(f"Batch ID: {batch['id']}")

4. Check Status

# Check status
status = requests.get(
    f"https://api.cheapestinference.ai/v1/batches/{batch['id']}",
    headers=headers,
).json()
print(f"Status: {status['status']}")
print(f"Progress: {status['request_counts']['completed']}/{status['request_counts']['total']}")

5. Download Results

# When complete, download results
if status["status"] == "completed":
    result = requests.get(
        f"https://api.cheapestinference.ai/v1/files/{status['output_file_id']}",
        headers=headers,
    )
    result.raise_for_status()

    import json
    for line in result.text.splitlines():
        if line.strip():
            row = json.loads(line)
            print(f"{row['custom_id']}: {row['response']['body']['choices'][0]['message']['content']}")

Use Cases

Data Processing

Process thousands of documents:
# Create requests for 10,000 documents
requests_payload = []
for i, doc in enumerate(documents):
    requests_payload.append({
        "custom_id": f"doc-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            "messages": [{"role": "user", "content": f"Summarize: {doc}"}]
        }
    })
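
If you would rather keep individual jobs small (or if the service caps requests per batch; check its limits), here is a sketch of splitting the payload across several JSONL files, each submitted as its own batch:

import json

# Split a large payload into files of 1,000 requests each
chunk_size = 1000
for start in range(0, len(requests_payload), chunk_size):
    chunk = requests_payload[start:start + chunk_size]
    with open(f"doc_batch_{start // chunk_size}.jsonl", "w") as f:
        for req in chunk:
            f.write(json.dumps(req) + "\n")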

Model Evaluation

Evaluate model performance on test sets:
# Batch evaluation
requests_payload = []
for i, (question, expected) in enumerate(test_set):
    requests_payload.append({
        "custom_id": f"eval-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            "messages": [{"role": "user", "content": question}]
        }
    })
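
Output order is not necessarily input order, so join answers back to the test set on custom_id before scoring. A sketch using exact match as a placeholder metric, assuming result holds the downloaded output file from step 5:

import json

# Join model answers to the test set on custom_id, then score exact matches
answers = {}
for line in result.text.splitlines():
    if line.strip():
        row = json.loads(line)
        answers[row["custom_id"]] = row["response"]["body"]["choices"][0]["message"]["content"]

correct = sum(
    1 for i, (question, expected) in enumerate(test_set)
    if answers.get(f"eval-{i}", "").strip() == expected.strip()
)
print(f"Accuracy: {correct / len(test_set):.1%}")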

Content Generation

Generate content at scale:
# Generate 1000 product descriptions
requests_payload = []
for product in products:
    requests_payload.append({
        "custom_id": f"product-{product.id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            "messages": [{"role": "user", "content": f"Write description for {product.name}"}]
        }
    })

Best Practices

Give every request a unique custom_id so results can be matched back to inputs
Choose a 24h completion window for most jobs, 72h for very large batches
Poll status periodically (for example, once a minute, as in the step 4 sketch) rather than in a tight loop
Check the error file for failed requests and retry them