ingest_files

Usage

Sync
Async

from morphik import Morphik

db = Morphik()

# Batch ingest files with shared metadata
result = db.ingest_files(
    files=["document1.pdf", "document2.docx", "image.png"], 
    metadata={"category": "reports"},
    use_colpali=True,
    parallel=True
)

# Process the results
for doc in result["documents"]:
    print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")

# Check for errors
for error in result["errors"]:
    print(f"Error ingesting {error.get('filename')}: {error.get('error')}")

from morphik import AsyncMorphik

async with AsyncMorphik() as db:
    # Batch ingest files with shared metadata
    result = await db.ingest_files(
        files=["document1.pdf", "document2.docx", "image.png"], 
        metadata={"category": "reports"},
        use_colpali=True,
        parallel=True
    )
    
    # Process the results
    for doc in result["documents"]:
        print(f"Successfully ingested: {doc.filename} (ID: {doc.external_id})")
    
    # Check for errors
    for error in result["errors"]:
        print(f"Error ingesting {error.get('filename')}: {error.get('error')}")

Parameters

files (List[Union[str, bytes, BinaryIO, Path]]): List of files to ingest (path strings, bytes, file objects, or Path objects)
metadata (Dict[str, Any] | List[Dict[str, Any]], optional): Metadata to apply to the files. Can be either:
- A single dict to apply to all files
- A list of dicts, one per file (must match the length of files)
use_colpali (bool, optional): Whether to use the ColPali-style embedding model. Defaults to True.
parallel (bool, optional): Whether to process files in parallel. Defaults to True.

Typed Metadata

Metadata values may be Python datetime, date, or Decimal objects as well as plain numeric types. The SDK normalizes these values and sends the accompanying metadata_types, which enables the advanced queries described in Metadata Filtering.

Returns

A dictionary with two keys:

documents: List of successfully ingested Document objects
errors: List of errors encountered during ingestion (each error is a dict with 'filename' and 'error' keys)

Advanced Examples

Per-File Metadata

Sync
Async

# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]

# Metadata must match the length of files list
metadata_list = [
    {"category": "reports", "author": "Alice"},
    {"category": "data", "source": "database"},
    {"category": "presentations", "department": "marketing"}
]

result = db.ingest_files(
    files=files,
    metadata=metadata_list
)

# Ingest files with different metadata for each file
files = ["report.pdf", "data.csv", "presentation.pptx"]

# Metadata must match the length of files list
metadata_list = [
    {"category": "reports", "author": "Alice"},
    {"category": "data", "source": "database"},
    {"category": "presentations", "department": "marketing"}
]

result = await db.ingest_files(
    files=files,
    metadata=metadata_list
)

Using Different File Input Types

Sync
Async

import io
from pathlib import Path

# Mixing different file input types
file1 = "document.pdf"                       # Path string
file2 = Path("image.png")                    # Path object
file3 = open("data.csv", "rb")               # File object
file4 = b"Hello, world!"                     # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data")   # BytesIO (requires filename)

result = db.ingest_files(
    files=[file1, file2, file3, file4, file5],
    metadata=[
        {"type": "document"},
        {"type": "image"},
        {"type": "data"},
        {"type": "text", "filename": "hello.txt"},
        {"type": "text", "filename": "memory-data.txt"}
    ]
)

# Don't forget to close file objects
file3.close()

import io
from pathlib import Path

# Mixing different file input types
file1 = "document.pdf"                       # Path string
file2 = Path("image.png")                    # Path object
file3 = open("data.csv", "rb")               # File object
file4 = b"Hello, world!"                     # Bytes (requires filename)
file5 = io.BytesIO(b"Some in-memory data")   # BytesIO (requires filename)

result = await db.ingest_files(
    files=[file1, file2, file3, file4, file5],
    metadata=[
        {"type": "document"},
        {"type": "image"},
        {"type": "data"},
        {"type": "text", "filename": "hello.txt"},
        {"type": "text", "filename": "memory-data.txt"}
    ]
)

# Don't forget to close file objects
file3.close()

Client

Document Ingestion

Document Retrieval

Data Organization

Document Updates

Batch Operations

Knowledge Graph Operations

Chat & Conversation Management

Document Management

Usage & Monitoring

Usage

Parameters

Typed Metadata

Returns

Advanced Examples

Per-File Metadata

Using Different File Input Types

Client

Document Ingestion

Document Retrieval

Data Organization

Document Updates

Batch Operations

Knowledge Graph Operations

Chat & Conversation Management

Document Management

Usage & Monitoring

Usage

Parameters

Typed Metadata

Returns

Advanced Examples

Per-File Metadata

Using Different File Input Types

Usage

Parameters

Typed Metadata

Returns

Advanced Examples

Per-File Metadata

Using Different File Input Types