Unstructured Data
Use the Unstructured Data service to process documents and videos, manage uploaded files, and enable semantic retrieval.
Supported Formats
formats = client.unstructured.formats()
print(formats.categories)
category = client.unstructured.detect("report.pdf")
print(category)
Document Jobs
job = client.unstructured.jobs.create(
source={"source_type": "url", "url": "https://example.com/report.pdf"},
options={"chunking": {"enabled": True, "chunk_size": 1000}},
webhook_url="https://your-app.com/webhook",
)
result = client.unstructured.jobs.wait_until_complete(
job.job_id,
poll_interval=2.0,
timeout=300.0,
)
job_status = client.unstructured.jobs.get(job.job_id)
job_result = client.unstructured.jobs.result(job.job_id)
jobs = client.unstructured.jobs.list()
client.unstructured.jobs.cancel(job.job_id)
Batch Processing
batch = client.unstructured.jobs.create_batch(
files=[
{"source_type": "url", "url": "https://example.com/doc1.pdf"},
{"source_type": "url", "url": "https://example.com/doc2.docx"},
{"source_type": "storage", "bucket": "my-bucket", "path": "reports/q1.pdf"},
],
options={"chunking": {"enabled": True}},
)
result = client.unstructured.jobs.wait_batch_complete(
batch.batch_id,
poll_interval=3.0,
timeout=600.0,
)
batches = client.unstructured.jobs.list_batches(page=1, page_size=20)

| Method | Return Type | Description |
|---|---|---|
| `jobs.create(...)` | `Job` | Start a single processing job |
| `jobs.get(job_id)` | `Job` | Get job status |
| `jobs.list()` | `list[Job]` | List all jobs |
| `jobs.result(job_id)` | `dict` | Get processing result |
| `jobs.cancel(job_id)` | `dict` | Cancel a running job |
| `jobs.create_batch(...)` | `BatchJob` | Start a batch processing job |
| `jobs.get_batch(batch_id)` | `BatchJob` | Get batch job status |
| `jobs.list_batches(...)` | `dict` | List batch jobs |
File Management
upload = client.unstructured.files.upload("report.pdf")
file = client.unstructured.files.create(
file_id=upload["file_id"],
filename="report.pdf",
file_size_bytes=102400,
)
file = client.unstructured.files.import_from_storage(
storage_path="reports/q1.pdf",
bucket="my-bucket",
)
files = client.unstructured.files.list()
detail = client.unstructured.files.get(file.file_id)
viewer = client.unstructured.files.viewer(file.file_id)
job = client.unstructured.files.process(file.file_id)
url = client.unstructured.files.download_url(file.file_id)
client.unstructured.files.delete(file.file_id)
Video Search and Chat
results = client.unstructured.video.search(
"product demo walkthrough",
top_k=10,
similarity_threshold=0.7,
rerank=True,
max_per_video=3,
)
for event in client.unstructured.video.chat(
    [{"role": "user", "content": "Summarize the key points in this video"}],
    stream=True,
):
    print(event.data)
response = client.unstructured.video.chat(
[{"role": "user", "content": "What products are shown?"}],
stream=False,
)
Video Embeddings
stats = client.unstructured.video.embeddings.stats()
coverage = client.unstructured.video.embeddings.coverage()
client.unstructured.video.embeddings.backfill(max_files=10)
client.unstructured.video.embeddings.delete("video_id")