Chat Models

DeepSeek R1 (Reasoning)

Best for complex reasoning, mathematics, and coding tasks.
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.environ["CHEAPESTINFERENCE_API_KEY"],
    base_url="https://api.cheapestinference.ai/v1",
)

response = client.chat.completions.create(
    model="deepseek/deepseek-r1",
    messages=[
        {
            "role": "user",
            "content": "Solve this math problem step by step: If x^2 + 5x + 6 = 0, what are the values of x?"
        }
    ],
    temperature=0.7
)

print(response.choices[0].message.content)
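
R1-style models emit a chain of thought before the final answer; depending on how the provider serves the model, that reasoning may arrive inline wrapped in <think> tags. A minimal sketch for separating the two, assuming that inline format (check the actual response shape before relying on it):
import re

content = response.choices[0].message.content

# Assumes the chain of thought arrives inline as <think>...</think>;
# adjust if the provider returns it as a separate response field instead.
match = re.match(r"<think>(.*?)</think>\s*(.*)", content, re.DOTALL)
if match:
    reasoning, answer = match.groups()
    print("Reasoning:", reasoning.strip())
    print("Answer:", answer.strip())
else:
    print(content)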

Llama 3.1 70B (General Purpose)

Best all-around model for most tasks.
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a professional email to schedule a meeting."}
    ]
)

Qwen 3 Next 80B (Multilingual)

Best for multilingual and coding tasks.
response = client.chat.completions.create(
    model="qwen/qwen-3-next-80b",
    messages=[
        {"role": "user", "content": "请用中文写一首关于春天的诗"}
    ]
)

Embedding Models

BGE Large EN v1.5

High-quality embeddings for RAG and semantic search.
embeddings = client.embeddings.create(
    model="BAAI/bge-large-en-v1.5",
    input=[
        "Artificial intelligence is transforming industries.",
        "Machine learning models require large datasets."
    ]
)

for i, embedding in enumerate(embeddings.data):
    print(f"Text {i}: {len(embedding.embedding)} dimensions")

Function Calling

Enable models to call external functions.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City name"
                    }
                },
                "required": ["location"]
            }
        }
    }
]

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[{"role": "user", "content": "What's the weather in San Francisco?"}],
    tools=tools,
    tool_choice="auto"
)

# Check if model wants to call a function
if response.choices[0].message.tool_calls:
    tool_call = response.choices[0].message.tool_calls[0]
    print(f"Function: {tool_call.function.name}")
    print(f"Arguments: {tool_call.function.arguments}")

Structured Outputs

Get reliable JSON responses.
import json

from pydantic import BaseModel

class EmailExtraction(BaseModel):
    subject: str
    sender: str
    recipient: str
    summary: str

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[
        {
            "role": "user",
            "content": "Extract key information from this email: [email text]"
        }
    ],
    response_format={"type": "json_object", "schema": EmailExtraction.model_json_schema()}
)

data = json.loads(response.choices[0].message.content)
print(data["subject"])

Streaming Responses

Stream responses for better UX.
stream = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[{"role": "user", "content": "Tell me a story"}],
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
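
If you also need the complete text afterwards (for logging or caching), collect chunks as they print; this variant replaces the loop above:
parts = []
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
        parts.append(delta)

full_text = "".join(parts)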

Batch Processing

Process multiple requests efficiently.
# Create a batch job
batch = client.batches.create(
    input_file_id="file-abc123",
    endpoint="/v1/chat/completions",
    completion_window="24h"
)

# Check status
status = client.batches.retrieve(batch.id)
print(f"Status: {status.status}")

# Download results when complete
if status.status == "completed":
    results = client.files.content(status.output_file_id)
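
The input_file_id refers to a JSONL file uploaded beforehand, one request per line. A sketch of building and uploading one, assuming the provider supports the OpenAI-style batch input format and files endpoint with purpose="batch":
import json

# One chat-completion request per line, each with a unique custom_id
requests = [
    {
        "custom_id": f"req-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
            "messages": [{"role": "user", "content": prompt}],
        },
    }
    for i, prompt in enumerate(["Summarize AI trends.", "Explain RAG briefly."])
]

with open("batch_input.jsonl", "w") as f:
    for req in requests:
        f.write(json.dumps(req) + "\n")

batch_file = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
print(batch_file.id)  # pass this as input_file_id when creating the batch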

Best Practices

  • Use smaller models (8B) for simple tasks; reserve larger models (70B+) for complex reasoning
  • Use a lower temperature (0.3-0.7) for factual tasks and a higher one (0.7-1.0) for creative work
  • Implement retry logic with exponential backoff (see the sketch below)
  • Monitor rate limits
  • Track token usage
  • Log errors for debugging
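
Retry logic deserves a concrete shape. A minimal sketch of exponential backoff with jitter, assuming the OpenAI SDK's RateLimitError and APIConnectionError cover the transient failures worth retrying (adjust for whatever errors this provider actually raises):
import random
import time

from openai import APIConnectionError, RateLimitError

def complete_with_retry(client, max_retries=5, **kwargs):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(**kwargs)
        except (RateLimitError, APIConnectionError) as e:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff with jitter: ~1s, ~2s, ~4s, ...
            delay = 2 ** attempt + random.random()
            print(f"Retry {attempt + 1} in {delay:.1f}s: {e}")
            time.sleep(delay)

response = complete_with_retry(
    client,
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[{"role": "user", "content": "Hello"}],
)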