[{"feature_extractor_name":"text_extractor","version":"v1","feature_extractor_id":"text_extractor_v1","description":"Extracts dense vector embeddings from text using E5-Large multilingual model. Optimized for semantic search, RAG applications, and general-purpose text retrieval. Supports text chunking/decomposition with multiple splitting strategies. With source_type='youtube', resolves YouTube URLs to caption text before embedding. Fast (5ms/doc) and supports 100+ languages.","icon":"file-text","source":"builtin","input_schema":{"description":"Input schema for the text extractor.","examples":[{"text":"How do I reset my password?"},{"text":"wireless bluetooth headphones with noise cancellation"}],"properties":{"text":{"description":"Text content to process into embeddings.","minLength":1,"title":"Text","type":"string"}},"required":["text"],"title":"TextExtractorInput","type":"object"},"output_schema":{"description":"Output schema for text extractor documents.\n\nWhen source_type='youtube', additional video metadata fields are populated.","examples":[{"text":"Meal kit delivery service with chef-crafted recipes","text_extractor_v1_embedding":[0.023,-0.041,0.018]}],"properties":{"text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"The processed text content for this document.","title":"Text"},"text_extractor_v1_embedding":{"anyOf":[{"items":{"type":"number"},"maxItems":1024,"minItems":1024,"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embedding (1024-dimensional).","title":"Text Extractor V1 Embedding"},"video_id":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"YouTube video ID.","title":"Video Id"},"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Video title.","title":"Title"},"channel":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"YouTube channel 
name.","title":"Channel"},"video_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Source YouTube video URL.","title":"Video Url"},"duration_seconds":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Total video duration in seconds.","title":"Duration Seconds"},"publish_date":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Video publish date (ISO format).","title":"Publish Date"},"start_ms":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Segment start time in milliseconds.","title":"Start Ms"},"end_ms":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Segment end time in milliseconds.","title":"End Ms"},"segment_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Index of this segment within the video (0-based).","title":"Segment Index"},"total_segments":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Total number of segments from this video.","title":"Total Segments"}},"title":"TextExtractorOutput","type":"object"},"parameter_schema":{"$defs":{"TextSplitStrategy":{"description":"Strategy for splitting text into chunks.","enum":["characters","words","sentences","paragraphs","pages","time_segments","none"],"title":"TextSplitStrategy","type":"string"}},"description":"Parameters for the text extractor.\n\nThe text extractor generates dense vector embeddings optimized for semantic similarity search.\nIt uses the E5-Large multilingual model to convert text into 1024-dimensional vectors.\n\nWhen ``source_type`` is ``\"youtube\"``, the extractor first resolves YouTube URLs\nto caption text via yt-dlp before chunking and embedding. 
Use ``split_by=\"time_segments\"``\nwith ``segment_length_seconds`` to segment captions by time window.","examples":[{"chunk_overlap":0,"chunk_size":1000,"extractor_type":"text_extractor","split_by":"none"},{"chunk_overlap":1,"chunk_size":5,"extractor_type":"text_extractor","split_by":"sentences"},{"extractor_type":"text_extractor","language":"en","segment_length_seconds":120,"source_type":"youtube","split_by":"time_segments"}],"properties":{"extractor_type":{"const":"text_extractor","default":"text_extractor","description":"Discriminator field for parameter type identification.","title":"Extractor Type","type":"string"},"source_type":{"default":"text","description":"Source content type. Use 'youtube' to resolve YouTube URLs to caption text before embedding. Default: 'text' (plain text input).","enum":["text","youtube"],"title":"Source Type","type":"string"},"split_by":{"$ref":"#/$defs/TextSplitStrategy","default":"none","description":"Strategy for splitting text into multiple documents."},"chunk_size":{"default":1000,"description":"Target size for each chunk.","maximum":10000,"minimum":1,"title":"Chunk Size","type":"integer"},"chunk_overlap":{"default":0,"description":"Number of units to overlap between consecutive chunks.","maximum":5000,"minimum":0,"title":"Chunk Overlap","type":"integer"},"segment_length_seconds":{"default":120,"description":"Length of each transcript segment in seconds (for time_segments split strategy). Shorter segments give more precise search results but more documents.","maximum":600,"minimum":30,"title":"Segment Length Seconds","type":"integer"},"language":{"default":"en","description":"Preferred language code for YouTube captions (when source_type='youtube').","title":"Language","type":"string"},"extract_captions":{"default":true,"description":"Extract auto-captions or manual subtitles from YouTube videos (when source_type='youtube'). 
Falls back to video description if False.","title":"Extract Captions","type":"boolean"},"response_shape":{"anyOf":[{"type":"string"},{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Define custom structured output using LLM extraction.","title":"Response Shape"},"llm_provider":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"LLM provider for structured extraction (openai, google, anthropic).","title":"Llm Provider"},"llm_model":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Specific LLM model for structured extraction.","title":"Llm Model"},"llm_api_key":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"API key for LLM operations (BYOK - Bring Your Own Key). Supports:\n- Direct key: 'sk-proj-abc123...'\n- Secret reference: '{{SECRET.openai_api_key}}'\n\nWhen using secret reference, the key is loaded from your organization's secrets vault at runtime. Store secrets via POST /v1/organizations/secrets.\n\nIf not provided, uses Mixpeek's default API keys.","title":"Llm Api Key"}},"title":"TextExtractorParams","type":"object"},"supported_input_types":["text","string"],"max_inputs":{"text":1},"default_parameters":{},"costs":{"tier":1,"tier_label":"SIMPLE","rates":[{"unit":"1k_tokens","credits_per_unit":1,"description":"Text embedding per 1K tokens"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://text_extractor@v1/multilingual_e5_large_instruct_v1","name":"multilingual_e5_large_instruct_v1","description":"Vector index for text embeddings.","type":"single","index":{"name":"text_extractor_v1_embedding","description":"Dense vector embedding for 
text.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["string","text"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["chunk_index","video_id","segment_index"]},{"feature_extractor_name":"multimodal_extractor","version":"v1","feature_extractor_id":"multimodal_extractor_v1","description":"**Multimodal extractor for VIDEO, AUDIO, IMAGE, TEXT, and GIF content** using unified Vertex embeddings (1408D).\n\nProcesses diverse media types in a unified embedding space for cross-modal search. Videos/audio are decomposed into segments with transcription, embeddings, OCR, and descriptions. Images and text are embedded directly.\n\n**Pipeline Steps:**\n1. Filter dataset to collection (if collection_id provided)\n2. Apply input mappings\n3. Detect content types via sampling (video/audio/image/text)\n4. **Content Routing:**\n   - **Video:** FFmpeg chunking (time/scene/silence) → Steps 5-9\n   - **Audio:** FFmpeg audio chunking (time/silence) → Steps 5-7\n   - **Image:** Direct to Step 8\n   - **Text:** Direct to Step 8\n   - **Mixed:** Branch by type, process separately, then union\n5. **Conditional:** Transcription (if run_transcription=true)\n   - Whisper API or Local GPU\n   - Speech-to-text for audio tracks\n6. **Conditional:** Transcription embeddings (if run_transcription_embedding=true)\n   - E5 text embeddings (1024D) from transcribed text\n7. Multimodal embeddings (if run_multimodal_embedding=true)\n   - Vertex AI embeddings (1408D) for all content types\n   - Unified space enables cross-modal search\n8. **Conditional:** Thumbnail generation (if enable_thumbnails=true, visual content only)\n   - 640px width at 85% quality\n   - Upload to S3 with optional CDN\n9. 
**Conditional:** Visual analysis (if run_video_description OR run_ocr=true, visual content only)\n   - Gemini-based descriptions and/or OCR\n10. **Output:** Segment/document records with embeddings and features\n\n**Use for:** Unified multimodal search, video content libraries, educational content, media platforms, cross-modal retrieval.\n\n**Processing speed:** Videos 0.5-2x realtime, Images <1s, Text <100ms","icon":"film","source":"builtin","input_schema":{"description":"Input schema for the multimodal extractor.\n\nDefines the media content (video, image, text, gif) that will be processed and embedded.\nUses Google Vertex multimodal embeddings to create a unified embedding space across all media types.\n\n**Multimodal Support**:\n    - VIDEO: Decomposed into segments with transcription, visual embeddings, and OCR\n    - IMAGE: Direct visual embeddings (no decomposition)\n    - TEXT: Direct text embeddings\n    - GIF: Treated as video, decomposed frame-by-frame\n\n**Bucket Schema Mapping**:\n    When mapping from bucket schema fields to extractor inputs:\n\n    - BucketSchemaFieldType.VIDEO → maps to 'video' input\n    - BucketSchemaFieldType.IMAGE → maps to 'image' input\n    - BucketSchemaFieldType.TEXT/STRING → maps to 'text' input\n\n    **GIF files**: There is no BucketSchemaFieldType.GIF. 
Instead, GIFs can be\n    declared as either IMAGE or VIDEO in your bucket schema:\n\n    - As IMAGE: GIF detected via MIME type, embedded as static image (first frame)\n    - As VIDEO: GIF detected via MIME type, decomposed frame-by-frame\n\n    Use VIDEO schema type for animated GIFs requiring frame-level search.\n\nRequirements:\n    - Provide ONE of: video, image, text, or gif\n    - VIDEO/GIF: Formats MP4, MOV, AVI, MKV, WebM, FLV, GIF\n    - IMAGE: Formats JPG, PNG, WebP, BMP, GIF (static)\n    - TEXT: Plain text string\n    - URLs must be accessible (S3, HTTP, HTTPS)","examples":[{"description":"Video: Educational lecture","video":"s3://education-videos/lecture-machine-learning-101.mp4"},{"description":"Image: Product photo","image":"https://cdn.example.com/products/laptop-pro-2024.jpg"},{"description":"Text: Product description","text":"High-performance laptop with M3 chip, 16GB RAM, perfect for developers"}],"oneOf":[{"required":["video"]},{"required":["image"]},{"required":["text"]},{"required":["gif"]}],"properties":{"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to video file for processing. Video will be decomposed into segments based on split_method, and each segment processed for features (transcription, embeddings, OCR, etc.). Supports formats: MP4, MOV, AVI, MKV, WebM, FLV. Recommended: 720p-1080p resolution, <2 hours duration. Examples: 's3://bucket/video.mp4', 'https://example.com/video.mp4'","examples":["s3://my-bucket/videos/lecture-01.mp4","https://storage.googleapis.com/my-videos/tutorial.mp4"],"title":"Video"},"image":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to image file for embedding. Image will be embedded directly using Vertex multimodal embeddings (1408D). Supports formats: JPG, PNG, WebP, BMP. Recommended: <10MB file size. 
Examples: 's3://bucket/image.jpg', 'https://example.com/photo.png'","examples":["s3://my-bucket/images/product.jpg","https://cdn.example.com/photos/banner.png"],"title":"Image"},"text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Plain text content for embedding. Text will be embedded directly using Vertex multimodal embeddings (1408D). Ideal for creating a unified embedding space with images and videos. Examples: Product descriptions, captions, summaries, labels","examples":["A red sports car driving on a mountain road at sunset","Machine learning tutorial covering neural networks and backpropagation"],"title":"Text"},"gif":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to GIF file for processing. GIF will be treated as video and decomposed frame-by-frame. Each frame processed for visual embeddings. Supports animated GIFs. Note: When using bucket schema, declare GIFs as VIDEO type for frame-level processing, or IMAGE type for static embedding. There is no separate GIF bucket schema type - the extractor auto-detects GIFs via MIME type. Examples: 's3://bucket/animation.gif', 'https://example.com/meme.gif'","examples":["s3://my-bucket/animations/loading.gif","https://media.example.com/reactions/thumbs-up.gif"],"title":"Gif"},"custom_thumbnail":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Optional custom thumbnail image URL or S3 path. When provided, this thumbnail is used instead of auto-generating one from the media. Useful when you have a pre-selected representative image for your content. Supports formats: JPG, PNG, WebP. 
Examples: 's3://bucket/thumb.jpg', 'https://example.com/poster.png'","examples":["s3://my-bucket/thumbnails/custom-thumb.jpg","https://cdn.example.com/posters/video-poster.png"],"title":"Custom Thumbnail"}},"title":"MultimodalExtractorInput","type":"object"},"output_schema":{"description":"Output schema for a single document produced by the multimodal extractor.\n\nEach video segment produces one document with multimodal features.\n\nOutput Structure:\n    - One document per video segment (timespan)\n    - Contains timing information (start/end timestamps)\n    - Includes all extracted features (transcription, embeddings, OCR, etc.)\n    - References source video via source_object_id\n    - Searchable via text (transcription) and visual (embeddings) content\n    - When response_shape is provided, includes custom structured fields (NEW)\n\nCustom Structured Extraction (NEW):\n    When response_shape parameter is set in MultimodalExtractorParams, custom fields\n    are automatically added to this output schema using get_multimodal_extractor_output_schema().\n    This enables extraction of structured data like product details, entity information,\n    or custom metadata fields that are stored directly in document metadata.\n\n    Example: With response_shape defining \"products\" and \"aesthetic\" fields,\n    each document will have those fields in addition to the base fields below.\n\nUse Cases:\n    - Search for specific moments in videos by spoken content\n    - Find visual scenes by description or similarity\n    - Extract and search text appearing in videos (signs, captions, etc.)\n    - Navigate to relevant segments via start_time/end_time\n    - Analyze video content at granular level\n    - Extract structured product/entity data for e-commerce and fashion (NEW)","properties":{"start_time":{"description":"Start time of the segment in seconds","title":"Start Time","type":"number"},"end_time":{"description":"End time of the segment in seconds","title":"End 
Time","type":"number"},"duration":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Total duration of the entire source video in seconds. This represents the full video length, not the segment duration. Useful for calculating segment position within the video (e.g., start_time/duration). Only populated for video content; None for images and text.","title":"Duration"},"transcription":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Transcription of the audio in the segment","title":"Transcription"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Generated description of the video segment","title":"Description"},"ocr_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Text extracted from video frames (OCR)","title":"Ocr Text"},"json_output":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Optional raw JSON output from underlying models.","title":"Json Output"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of the thumbnail image for this video segment. Automatically generated during processing. Useful for UI previews and visual navigation.","examples":["s3://mixpeek-storage/ns_123/obj_456/thumbnails/thumb_0.jpg"],"title":"Thumbnail Url"},"source_video_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of the original source video this segment was extracted from. Maintains data lineage from bucket object to processed segment. Use for tracking provenance and accessing the full original video. 
OPTIONAL for images and text, POPULATED for video segments.","examples":["s3://mixpeek-storage/ns_123/obj_456/original.mp4","s3://user-bucket/videos/campaign_video.mp4"],"title":"Source Video Url"},"video_segment_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of this specific video segment file. Enables collection-to-collection decomposition pipelines where this segment can be re-processed by another collection with different settings. Example use cases: - Time segments (5s) → Scene detection within each segment - Coarse segments → Fine-grained analysis - Initial extraction → Enhanced processing with different models OPTIONAL for initial processing, REQUIRED for decomposition chains.","examples":["s3://mixpeek-storage/ns_123/obj_456/segments/segment_0.mp4","s3://mixpeek-storage/ns_123/obj_456/segments/segment_1.mp4"],"title":"Video Segment Url"},"multimodal_extractor_v1_multimodal_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embeddings (1408D) for multimodal content (video/image/gif/text) using Google Vertex AI. Captures visual and contextual information from all media types in a unified embedding space. Used for semantic search and similarity matching across all content types.","title":"Multimodal Extractor V1 Multimodal Embedding"},"multimodal_extractor_v1_transcription_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embeddings (1024D) for the transcription text using E5-Large. Captures semantic meaning of spoken content. Used for text-based search across video transcriptions.","title":"Multimodal Extractor V1 Transcription Embedding"},"internal_metadata":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Internal metadata for the video segment. Contains processing details, model versions, and diagnostic information. 
NOT REQUIRED for typical usage.","title":"Internal Metadata"}},"required":["start_time","end_time"],"title":"MultimodalExtractorOutput","type":"object"},"parameter_schema":{"$defs":{"GenerationConfig":{"description":"Configuration for generative models.","properties":{"candidate_count":{"default":1,"description":"Number of candidate responses to generate for video description.","title":"Candidate Count","type":"integer"},"max_output_tokens":{"default":1024,"description":"Maximum number of tokens for the generated video description.","title":"Max Output Tokens","type":"integer"},"temperature":{"default":0.7,"description":"Controls randomness for video description generation. Higher is more random.","title":"Temperature","type":"number"},"top_p":{"default":0.8,"description":"Nucleus sampling (top-p) for video description generation.","title":"Top P","type":"number"},"response_mime_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"MIME type for response (e.g., 'application/json')","title":"Response Mime Type"},"response_schema":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"JSON schema for structured output","title":"Response Schema"}},"title":"GenerationConfig","type":"object"},"SplitMethod":{"description":"Split methods for video extraction.","enum":["time","scene","silence"],"title":"SplitMethod","type":"string"}},"description":"Parameters for the multimodal extractor.\n\nThe multimodal extractor processes video, audio, image, text, and GIF content in a unified embedding space.\nVideos/GIFs/Audio are decomposed into segments with transcription, visual analysis (video only), OCR, and embeddings.\nImages and text are embedded directly without decomposition.\n\n**When to Use**:\n    - Video content libraries requiring searchable segments\n    - Audio content (podcasts, lectures, music) requiring transcription and search\n    - Media platforms with search across spoken and visual 
content\n    - Educational content with lecture videos and demonstrations\n    - Surveillance/security footage requiring event detection\n    - Social media platforms with user-generated video content\n    - Broadcasting/streaming services with large video catalogs\n    - Training video repositories with instructional content\n    - Marketing/advertising analytics for video campaigns\n\n**When NOT to Use**:\n    - Static image collections → Use image_extractor instead\n    - Very short videos (<5 seconds) → Overhead not worth it\n    - Real-time live streams → Use specialized streaming extractors\n    - Extremely high-resolution videos (8K+) → Consider downsampling first\n\n**Decomposition Methods**:\n\n    | Method | Use Case | Accuracy | Segments/Min | Best For |\n    |--------|----------|----------|--------------|----------|\n    | **TIME** | Fixed intervals | N/A | 60/interval_sec | General purpose, audio/video chunking |\n    | **SCENE** | Visual changes | 85-90% | Variable (2-20) | Movies, dynamic content (video only) |\n    | **SILENCE** | Audio pauses | 80-85% | Variable (5-30) | Lectures, presentations, audio/video |\n\n**Feature Extraction Options**:\n    - Transcription: Speech-to-text using Whisper (95%+ accuracy)\n    - Multimodal Embeddings: Unified embeddings from Vertex AI (1408D) for video/image/gif/text\n    - Transcription Embeddings: Text embeddings from E5-Large (1024D)\n    - OCR: Text extraction from video frames using Gemini Vision\n    - Descriptions: AI-generated segment summaries using Gemini\n    - Thumbnails: Visual preview images for each segment\n\n**Performance Characteristics**:\n    - Processing Speed: 0.5-2x realtime (depends on features enabled)\n    - Example: 10min video → 5-20 minutes processing time\n    - Transcription: ~200ms per second of audio\n    - Visual Embedding: ~50ms per segment\n    - OCR: ~300ms per segment\n    - Description: ~2s per segment (if enabled)\n\nRequirements:\n    - video URL: REQUIRED (accessible 
video file)\n    - All feature parameters: OPTIONAL (defaults provided)","examples":[{"description":"Standard video processing with 10-second intervals (default)","enable_thumbnails":true,"extractor_type":"multimodal_extractor","run_multimodal_embedding":true,"split_method":"time","time_split_interval":10,"use_case":"General-purpose video indexing for search and discovery"},{"description":"Educational content with transcription + embeddings","extractor_type":"multimodal_extractor","run_multimodal_embedding":true,"run_transcription":true,"run_transcription_embedding":true,"split_method":"time","time_split_interval":10,"transcription_language":"en","use_case":"Lecture videos and online courses requiring searchable spoken content"}],"properties":{"extractor_type":{"const":"multimodal_extractor","default":"multimodal_extractor","description":"Discriminator field for parameter type identification. Must be 'multimodal_extractor'.","title":"Extractor Type","type":"string"},"split_method":{"$ref":"#/$defs/SplitMethod","default":"time","description":"The PRIMARY control for video splitting strategy. This determines which splitting method is used."},"description_prompt":{"default":"Describe the video segment in detail.","description":"The prompt to use for description generation.","title":"Description Prompt","type":"string"},"time_split_interval":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":10,"description":"Interval in seconds for 'time' splitting. Used when split_method='time'.","title":"Time Split Interval"},"silence_db_threshold":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"The decibel level below which audio is considered silent. Used when split_method='silence'. Recommended value: -40 (auto-applied if not specified). 
Lower values (e.g., -50) detect more silence, higher values (e.g., -30) detect less.","title":"Silence Db Threshold"},"scene_detection_threshold":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"The threshold for scene detection (0.0-1.0). Used when split_method='scene'. Recommended value: 0.5 (auto-applied if not specified). Lower values (e.g., 0.3) detect more scenes, higher values (e.g., 0.7) detect fewer scenes.","title":"Scene Detection Threshold"},"run_transcription":{"default":false,"description":"Whether to run transcription on video segments.","title":"Run Transcription","type":"boolean"},"transcription_language":{"default":"en","description":"The language of the transcription. Used when run_transcription is True.","title":"Transcription Language","type":"string"},"run_video_description":{"default":false,"description":"Whether to generate descriptions for video segments. OPTIMIZED: Defaults to False as descriptions add 1-2 minutes. Enable only when needed.","title":"Run Video Description","type":"boolean"},"run_transcription_embedding":{"default":false,"description":"Whether to generate embeddings for transcriptions. Useful for semantic search over spoken content.","title":"Run Transcription Embedding","type":"boolean"},"run_multimodal_embedding":{"default":true,"description":"Whether to generate multimodal embeddings for all content types (video/image/gif/text). Uses Google Vertex AI to create unified 1408D embeddings in a shared semantic space. Useful for cross-modal semantic search across all media types.","title":"Run Multimodal Embedding","type":"boolean"},"run_ocr":{"default":false,"description":"Whether to run OCR to extract text from video frames. OPTIMIZED: Defaults to False as OCR adds significant processing time. 
Enable only when text extraction from video is required.","title":"Run Ocr","type":"boolean"},"sensitivity":{"default":"low","description":"The sensitivity of the scene detection.","title":"Sensitivity","type":"string"},"enable_thumbnails":{"default":true,"description":"Whether to generate thumbnail images for video segments and images. Thumbnails provide visual previews for navigation and UI display. For videos: Extracts a frame from each segment. For images: Creates an optimized thumbnail version. ","title":"Enable Thumbnails","type":"boolean"},"use_cdn":{"default":false,"description":"Whether to use CloudFront CDN for thumbnail delivery. When True: Uploads to public bucket and returns CloudFront URLs. When False (default): Uploads to private bucket with presigned S3 URLs. Benefits of CDN: faster global delivery, permanent URLs, reduced bandwidth costs. Requires CLOUDFRONT_PUBLIC_DOMAIN to be configured in settings. Only applies when enable_thumbnails=True.","title":"Use Cdn","type":"boolean"},"generation_config":{"$ref":"#/$defs/GenerationConfig"},"response_shape":{"anyOf":[{"type":"string"},{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"OPTIONAL. Define custom structured output using Gemini's JSON mode. NOT REQUIRED - by default, descriptions are stored as plain text. When provided, Gemini will extract structured data matching this schema. \n\nTwo modes supported:\n1. Natural language prompt (string): Describe desired output in plain English\n   - Gemini automatically infers JSON schema from your description\n   - Example: 'Extract product names, colors, and aesthetic labels'\n\n2. 
Explicit JSON schema (dict): Provide complete JSON schema for output structure\n   - Full control over output structure, types, and constraints\n   - Use response_mime_type='application/json' in generation_config\n   - Example: {'type': 'object', 'properties': {'products': {'type': 'array', ...}}}\n\n\nUse when:\n  - Need to extract structured product/entity information from videos\n  - Want consistent, parseable output format (not free-form text)\n  - Require specific fields like visibility_percentage, product categories, etc.\n  - Building e-commerce, fashion, or product discovery applications\n\n\nOutput fields are automatically added to collection schema and stored in document metadata.\nNote: When using response_shape, set description_prompt to describe the extraction task.\n","examples":["Extract product names, colors, materials, and aesthetic style labels from this fashion segment",{"properties":{"products":{"items":{"properties":{"name":{"type":"string"},"category":{"type":"string"},"visibility_percentage":{"maximum":100,"minimum":0,"type":"integer"}},"type":"object"},"type":"array"},"aesthetic":{"type":"string"}},"type":"object"},null],"title":"Response Shape"}},"title":"MultimodalExtractorParams","type":"object"},"supported_input_types":["video","image","text","string"],"max_inputs":{"video":1,"image":1,"text":1,"string":1},"default_parameters":{},"costs":{"tier":4,"tier_label":"PREMIUM","rates":[{"unit":"minute","credits_per_unit":50,"description":"Video processing per minute"},{"unit":"image","credits_per_unit":5,"description":"Image analysis"},{"unit":"1k_tokens","credits_per_unit":2,"description":"Text processing per 1K tokens"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://multimodal_extractor@v1/vertex_multimodal_embedding","name":"vertex_multimodal_embedding","description":"Vector index for multimodal content embeddings 
(video/image/gif/text).","type":"single","index":{"name":"multimodal_extractor_v1_multimodal_embedding","description":"Dense vector embedding for multimodal content in unified embedding space.","dimensions":1408,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["video","text","image"],"inference_name":"google__vertex_multimodal","inference_service_id":"google/vertex-multimodal","purpose":null,"vector_name_override":null}},{"feature_uri":"mixpeek://multimodal_extractor@v1/multilingual_e5_large_instruct_v1","name":"multilingual_e5_large_instruct_v1","description":"Vector index for transcription embeddings.","type":"single","index":{"name":"multimodal_extractor_v1_transcription_embedding","description":"Dense vector embedding for transcriptions.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["string","text"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["start_time","end_time"]},{"feature_extractor_name":"image_extractor","version":"v1","feature_extractor_id":"image_extractor_v1","description":"Image embedding extractor using Google SigLIP (768D). Generates dense vector embeddings for images and PDFs, with text query support in the same contrastive latent space. 
Optimized for visual similarity search, product matching, and text-to-image search.","icon":"image","source":"builtin","input_schema":{"description":"Input schema for image extractor.","examples":[{"image":"s3://bucket/images/product.jpg"}],"properties":{"image":{"description":"Image or PDF URL or S3 path for embedding generation.","title":"Image","type":"string"}},"required":["image"],"title":"ImageExtractorInput","type":"object"},"output_schema":{"additionalProperties":true,"description":"Output schema for image extractor.","properties":{"image_extractor_v1_embedding":{"description":"SigLIP image embedding (768-d).","items":{"type":"number"},"maxItems":768,"minItems":768,"title":"Image Extractor V1 Embedding","type":"array"},"processing_time_ms":{"description":"Processing time in milliseconds","title":"Processing Time Ms","type":"number"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of the thumbnail image","title":"Thumbnail Url"},"page_number":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Page number for PDF sources (1-indexed)","title":"Page Number"},"total_pages":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Total number of pages in the PDF","title":"Total Pages"}},"required":["image_extractor_v1_embedding","processing_time_ms"],"title":"ImageExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for the Image Extractor.","examples":[{"enable_thumbnails":true,"extractor_type":"image_extractor","use_cdn":false}],"properties":{"extractor_type":{"const":"image_extractor","default":"image_extractor","description":"Discriminator field for parameter type identification.","title":"Extractor Type","type":"string"},"enable_thumbnails":{"default":true,"description":"Whether to generate thumbnail images.","title":"Enable Thumbnails","type":"boolean"},"use_cdn":{"default":false,"description":"Whether to use CloudFront CDN for thumbnail 
delivery.","title":"Use Cdn","type":"boolean"}},"title":"ImageExtractorParams","type":"object"},"supported_input_types":["image","pdf"],"max_inputs":{"image":1,"pdf":1},"default_parameters":{},"costs":{"tier":1,"tier_label":"SIMPLE","rates":[{"unit":"image","credits_per_unit":2,"description":"Base cost per image processed"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://image_extractor@v1/google_siglip_base_v1","name":"google_siglip_base_v1","description":"Vector index for image and text embeddings (SigLIP 768-d, shared latent space)","type":"single","index":{"name":"image_extractor_v1_embedding","description":"Dense vector embedding for image and text content using SigLIP (shared contrastive space).","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image","pdf","text","string"],"inference_name":"google__siglip_base_patch16_224","inference_service_id":"google/siglip-base-patch16-224","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":[]},{"feature_extractor_name":"face_identity_extractor","version":"v1","feature_extractor_id":"face_identity_extractor_v1","description":"Production-grade face recognition using state-of-the-art models (SCRFD + ArcFace). Detects faces, aligns to canonical template, generates 512-dimensional embeddings with 99.8%+ verification accuracy (LFW benchmark). Supports images, videos, and PDFs.\n\n**Pipeline Steps:**\n1. Filter dataset to collection (if collection_id provided)\n2. **Content Type Routing:**\n   - **Images:** Direct to Step 3\n   - **Videos:** Frame extraction (sampling at video_sampling_fps) → Step 3\n   - **PDFs:** Page rendering → Step 3\n   - **Mixed:** Branch by type, process separately, then union\n3. Face detection using SCRFD\n   - Detect all faces per image/frame/page\n   - Extract 5-point facial landmarks (eyes, nose, mouth)\n   - Filter by min_face_size and detection_threshold\n4. 
5-point affine face alignment\n   - Warp face to canonical 112x112 template\n   - Mandatory for consistent embeddings\n5. ArcFace embedding generation (512D, L2 normalized)\n   - arcface_r100 model\n   - Cosine similarity for matching\n6. **Conditional:** Quality scoring (if enable_quality_scoring=true)\n   - Assess blur, size, landmark confidence\n   - Filter by quality_threshold if specified\n7. **Conditional:** Video deduplication (if video_deduplication=true AND video content)\n   - Remove duplicate faces across frames\n   - Threshold-based similarity matching\n   - Track face timelines in video\n8. Output validation\n9. **Output:** Per-face documents with embeddings, bbox, landmarks, and quality scores\n\n**Use for:** Employee verification, photo organization, face clustering, surveillance, identity systems.\n\n**Not for:** General image search (use image_extractor), object detection (use multimodal_extractor).","icon":"user-circle","source":"builtin","input_schema":{"description":"Input schema for face identity extractor.\n\nProvide exactly ONE of: image, video, video_frame, or pdf.","examples":[{"description":"Single portrait","image":"s3://photos/john-doe-portrait.jpg"},{"description":"Video segment","video":"s3://segments/interview-clip.mp4"},{"description":"PDF document with photo","pdf":"s3://documents/id-card.pdf"}],"properties":{"image":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Image URL or S3 path containing faces. Formats: JPEG, PNG, WebP, BMP. Resolution: 640px+ recommended.","examples":["s3://bucket/photos/portrait.jpg","https://example.com/photo.png"],"title":"Image"},"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Video URL or S3 path. Subject to max_video_length limit. Formats: MP4, MOV, AVI, MKV, WebM. 
Sampling controlled by video_sampling_fps.","examples":["s3://bucket/videos/interview-60sec.mp4","s3://bucket/segments/clip-5sec.mp4"],"title":"Video"},"video_frame":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Single video frame URL or S3 path. Treated as image.","examples":["s3://bucket/frames/frame_001.jpg"],"title":"Video Frame"},"pdf":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"PDF URL or S3 path containing faces in pages. Each page is converted to an image and processed for faces. Useful for ID documents, resumes, forms with photos.","examples":["s3://bucket/documents/id-card.pdf","s3://bucket/resumes/profile-with-photo.pdf"],"title":"Pdf"}},"title":"FaceIdentityExtractorInput","type":"object"},"output_schema":{"additionalProperties":true,"description":"Output schema for face identity extractor.\n\nEach document represents ONE detected face with its aligned embedding.\n\nNote: document_id, collection_id, and source_object_id are added automatically\nby the system and should not be included in extractor output.","properties":{"face_identity_extractor_v1_embedding":{"description":"ArcFace face embedding (512-d L2-normalized vector). Use cosine similarity for face matching. 
Similarity > 0.25-0.30 indicates same person.","items":{"type":"number"},"maxItems":512,"minItems":512,"title":"Face Identity Extractor V1 Embedding","type":"array"},"face_index":{"description":"Index of this face in source image (0-based)","title":"Face Index","type":"integer"},"bbox":{"additionalProperties":{"type":"number"},"description":"Face bounding box {x1, y1, x2, y2, width, height}","title":"Bbox","type":"object"},"detection_score":{"description":"SCRFD detection confidence (0.0-1.0)","title":"Detection Score","type":"number"},"landmarks":{"additionalProperties":{"items":{"type":"number"},"type":"array"},"description":"5 facial landmarks for alignment","title":"Landmarks","type":"object"},"quality_score":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Face quality score (0.0-1.0)","title":"Quality Score"},"quality_components":{"anyOf":[{"additionalProperties":{"type":"number"},"type":"object"},{"type":"null"}],"default":null,"description":"Quality component scores","title":"Quality Components"},"aligned_face_crop":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Base64 aligned 112×112 face crop","title":"Aligned Face Crop"},"source_frame_thumbnail":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Base64 resized source frame thumbnail for display with bbox overlay","title":"Source Frame Thumbnail"},"source_frame_width":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Original source frame width in pixels","title":"Source Frame Width"},"source_frame_height":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Original source frame height in pixels","title":"Source Frame Height"},"frame_number":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Frame number in source video","title":"Frame Number"},"timestamp":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Timestamp in source video 
(seconds)","title":"Timestamp"},"page_number":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Page number in source PDF (0-based)","title":"Page Number"},"embedding_model":{"description":"Embedding model used","title":"Embedding Model","type":"string"},"detection_model":{"description":"Detection model used","title":"Detection Model","type":"string"},"processing_time_ms":{"description":"Processing time (milliseconds)","title":"Processing Time Ms","type":"number"}},"required":["face_identity_extractor_v1_embedding","face_index","bbox","detection_score","landmarks","embedding_model","detection_model","processing_time_ms"],"title":"FaceIdentityExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for the Face Identity Extractor.\n\nThe Face Identity Extractor processes images or video frames to detect, align,\nand embed faces using production-grade SOTA models (SCRFD + ArcFace).\n\nCore Pipeline:\n1. SCRFD Detection → Bounding boxes + 5 landmarks\n2. 5-Point Affine Alignment → 112×112 canonical face\n3. ArcFace Embedding → 512-d L2-normalized vector\n4. 
Optional Quality Scoring → Filter low-quality faces\n\nUse Cases:\n    - Face verification (1:1 matching)\n    - Face identification (1:N search)\n    - Face clustering (group photos by person)\n    - Duplicate face detection","examples":[{"description":"Employee verification (high quality, 1:1 matching)","detection_model":"scrfd_2.5g","detection_threshold":0.7,"enable_quality_scoring":true,"extractor_type":"face_identity_extractor","max_faces_per_image":1,"min_face_size":40,"normalize_embeddings":true,"output_mode":"per_face","quality_threshold":0.5,"use_case":"Corporate access control, employee ID photos for badge matching"},{"description":"Photo library organization (multiple faces)","detection_model":"scrfd_2.5g","detection_threshold":0.6,"enable_quality_scoring":true,"extractor_type":"face_identity_extractor","max_faces_per_image":null,"min_face_size":30,"output_mode":"per_face","store_detection_metadata":true,"use_case":"Personal photo management: group photos by person"}],"properties":{"extractor_type":{"const":"face_identity_extractor","default":"face_identity_extractor","description":"Discriminator field for parameter type identification. Must be 'face_identity_extractor'.","title":"Extractor Type","type":"string"},"detection_model":{"default":"scrfd_2.5g","description":"SCRFD model for face detection. 'scrfd_500m': Fastest (2-3ms). 'scrfd_2.5g': Balanced (5-7ms), recommended. 'scrfd_10g': Highest accuracy (10-15ms).","enum":["scrfd_500m","scrfd_2.5g","scrfd_10g"],"title":"Detection Model","type":"string"},"min_face_size":{"default":20,"description":"Minimum face size in pixels to detect. 20px: Balanced. 40px: Higher quality. 
10px: Maximum recall.","maximum":200,"minimum":10,"title":"Min Face Size","type":"integer"},"detection_threshold":{"default":0.5,"description":"Confidence threshold for face detection (0.0-1.0).","maximum":1.0,"minimum":0.0,"title":"Detection Threshold","type":"number"},"max_faces_per_image":{"anyOf":[{"minimum":1,"type":"integer"},{"type":"null"}],"default":null,"description":"Maximum number of faces to process per image. None: Process all.","title":"Max Faces Per Image"},"normalize_embeddings":{"default":true,"description":"L2-normalize embeddings to unit vectors (recommended).","title":"Normalize Embeddings","type":"boolean"},"enable_quality_scoring":{"default":true,"description":"Compute quality scores (blur, size, landmarks). Adds ~5ms per face.","title":"Enable Quality Scoring","type":"boolean"},"quality_threshold":{"anyOf":[{"maximum":1.0,"minimum":0.0,"type":"number"},{"type":"null"}],"default":null,"description":"Minimum quality score to index faces. None: Index all faces. 0.5: Moderate filtering. 0.7: High quality only.","title":"Quality Threshold"},"max_video_length":{"default":60,"description":"Maximum video length in seconds. 60: Default. 10: Recommended for retrieval. 300: Maximum (extraction only).","maximum":300,"minimum":1,"title":"Max Video Length","type":"integer"},"video_sampling_fps":{"anyOf":[{"maximum":60.0,"minimum":0.1,"type":"number"},{"type":"null"}],"default":1.0,"description":"Frames per second to sample from video. 1.0: One frame per second (recommended).","title":"Video Sampling Fps"},"video_deduplication":{"default":true,"description":"Remove duplicate faces across video frames (extraction only). Reduces 90-95% redundancy. NOT used in retrieval.","title":"Video Deduplication","type":"boolean"},"video_deduplication_threshold":{"default":0.8,"description":"Cosine similarity threshold for deduplication. 
0.8: Conservative (default).","maximum":1.0,"minimum":0.0,"title":"Video Deduplication Threshold","type":"number"},"output_mode":{"default":"per_face","description":"'per_face': One document per face (recommended). 'per_image': One doc per image with faces array.","enum":["per_face","per_image"],"title":"Output Mode","type":"string"},"include_face_crops":{"default":false,"description":"Include aligned 112×112 face crops as base64. Adds ~5KB per face.","title":"Include Face Crops","type":"boolean"},"include_source_frame_thumbnail":{"default":false,"description":"Include resized source frame/image as base64 thumbnail (~15-30KB per face). Used for display with bounding box overlay.","title":"Include Source Frame Thumbnail","type":"boolean"},"store_detection_metadata":{"default":true,"description":"Store bbox, landmarks, detection scores. Recommended for debugging.","title":"Store Detection Metadata","type":"boolean"}},"title":"FaceIdentityExtractorParams","type":"object"},"supported_input_types":["image","video","pdf"],"max_inputs":{"image":1,"video":1,"pdf":1},"default_parameters":{},"costs":{"tier":3,"tier_label":"COMPLEX","rates":[{"unit":"image","credits_per_unit":5,"description":"Base cost per image processed"},{"unit":"face","credits_per_unit":5,"description":"Additional cost per face detected"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://face_identity_extractor@v1/insightface__arcface","name":"insightface__arcface","description":"Vector index for face identity embeddings (ArcFace 512-d)","type":"single","index":{"name":"face_identity_extractor_v1_embedding","description":"Dense vector embedding for face identity verification and search. 512-dimensional L2-normalized ArcFace embeddings. Use cosine similarity for face matching. 
Similarity threshold: 0.25-0.30 for same person verification.","dimensions":512,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image"],"inference_name":"insightface__arcface","inference_service_id":"insightface/arcface","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["face_index","frame_number","page_number"]},{"feature_extractor_name":"document_graph_extractor","version":"v1","feature_extractor_id":"document_graph_extractor_v1","description":"Extracts spatial blocks from PDFs with layout classification and confidence scoring. Decomposes documents into paragraphs, tables, forms, lists, headers, footers, figures, and handwritten content. Includes optional VLM correction for low-confidence blocks. Best for archival documents, scanned files, and documents requiring spatial understanding.\n\n**Pipeline Steps:**\n1. Filter dataset to collection (if collection_id provided)\n2. Find and resolve PDF URL from row data\n3. **Layout Detection Mode Fork:**\n   - **If use_layout_detection=true (NEW - ML-based):**\n     a. PaddleOCR layout detection (finds ALL elements: text, images, tables)\n     b. Skip to Step 4 (object_type already set by detector)\n   - **If use_layout_detection=false (LEGACY - Text-only):**\n     a. PyMuPDF span extraction (text with bounding boxes)\n     b. Spatial clustering (group nearby spans into logical blocks)\n     c. Layout classification (rule-based: paragraph, table, form, etc.)\n4. Confidence scoring (A/B/C/D tags based on extraction quality)\n5. Text cleaning (remove OCR artifacts, normalize whitespace)\n6. **Conditional:** Page rendering (if generate_thumbnails=true OR use_vlm_correction=true)\n   - Full page and segment thumbnails at configured DPI\n7. **Conditional:** VLM correction (if use_vlm_correction=true AND not fast_mode AND confidence C/D)\n   - Gemini/OpenAI/Anthropic vision models correct low-confidence text\n8. 
**Conditional:** Text embedding (if run_text_embedding=true)\n   - E5-Large embeddings (1024D) for semantic search\n9. **Output:** Block-level documents with text, layout type, bbox, confidence, and embeddings\n\n**Use for:** Archival documents, scanned PDFs, forms processing, structured extraction, document understanding.\n\n**Not for:** Simple text extraction (use text_extractor), images (use image_extractor).","icon":"file-scan","source":"builtin","input_schema":{"description":"Input schema for the document graph extractor.\n\nDefines the PDF file that will be processed into spatial blocks.","examples":[{"description":"Archival document for block extraction","pdf":"s3://archive-docs/hoffman-file-part-35.pdf"}],"properties":{"pdf":{"description":"REQUIRED. URL or S3 path to PDF file for processing. PDF will be decomposed into spatial blocks with layout classification. Supports any PDF version, both digital and scanned documents. Examples: 's3://bucket/document.pdf', 'https://example.com/report.pdf'","examples":["s3://my-bucket/documents/fbi-file-part-35.pdf","https://storage.googleapis.com/my-docs/archival-record.pdf"],"title":"Pdf","type":"string"}},"required":["pdf"],"title":"DocumentGraphExtractorInput","type":"object"},"output_schema":{"$defs":{"BoundingBox":{"description":"Bounding box coordinates for a block.","properties":{"x0":{"description":"Left edge x-coordinate","title":"X0","type":"number"},"y0":{"description":"Top edge y-coordinate","title":"Y0","type":"number"},"x1":{"description":"Right edge x-coordinate","title":"X1","type":"number"},"y1":{"description":"Bottom edge y-coordinate","title":"Y1","type":"number"}},"required":["x0","y0","x1","y1"],"title":"BoundingBox","type":"object"},"ConfidenceTag":{"description":"Confidence tags for extraction quality.","enum":["A","B","C","D"],"title":"ConfidenceTag","type":"string"},"ObjectType":{"description":"Block/object types produced by document graph 
extractor.","enum":["paragraph","table","form","list","header","footer","figure","handwritten"],"title":"ObjectType","type":"string"}},"description":"Output schema for a single block produced by the document graph extractor.\n\nEach block represents a spatially-clustered region of the document with\nlayout classification and confidence scoring.","examples":[{"bbox":{"x0":14.0,"x1":551.0,"y0":272.0,"y1":375.0},"block_index":2,"confidence_tag":"A","description":"High-confidence paragraph block","object_type":"paragraph","overall_confidence":0.85,"page_number":1,"source_file":"abbie-hoffman-part-35.pdf","text_corrected":"HOFFMAN has been a participant in conversations...","text_raw":"HOFFMAN has been a participant in conversations...","total_pages":15}],"properties":{"page_number":{"description":"Page number in original PDF (1-indexed)","title":"Page Number","type":"integer"},"object_type":{"$ref":"#/$defs/ObjectType","description":"Classified type of this block. PARAGRAPH: Regular text. TABLE: Tabular data. FORM: Form fields. LIST: Bulleted/numbered lists. HEADER/FOOTER: Page headers/footers. FIGURE: Images/diagrams. HANDWRITTEN: Handwritten content."},"block_index":{"description":"Block index within the page (0-indexed)","title":"Block Index","type":"integer"},"bbox":{"$ref":"#/$defs/BoundingBox","description":"Bounding box coordinates for this block on the page"},"text_raw":{"description":"Original extracted text from the block (before cleaning)","title":"Text Raw","type":"string"},"text_corrected":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Cleaned and/or VLM-corrected text. 
Contains cleaned text for high-confidence blocks, VLM-corrected text for low-confidence blocks (if enabled).","title":"Text Corrected"},"overall_confidence":{"description":"Extraction confidence score (0.0-1.0)","maximum":1.0,"minimum":0.0,"title":"Overall Confidence","type":"number"},"confidence_tag":{"$ref":"#/$defs/ConfidenceTag","description":"Confidence category. A: >=0.85 (high). B: >=0.70 (medium). C: >=0.50 (low, may need verification). D: <0.50 (very low, needs VLM)."},"document_graph_extractor_v1_text_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embedding for text content (1024-dim E5)","title":"Document Graph Extractor V1 Text Embedding"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL to full page thumbnail (low-res image of entire page)","title":"Thumbnail Url"},"segment_thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL to segment thumbnail (cropped to block's bounding box)","title":"Segment Thumbnail Url"},"total_pages":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Total pages in source PDF","title":"Total Pages"},"source_file":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Original source file name","title":"Source File"}},"required":["page_number","object_type","block_index","bbox","text_raw","overall_confidence","confidence_tag"],"title":"DocumentGraphExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for the document graph extractor.\n\nThis extractor decomposes PDFs into spatial blocks with layout classification,\nconfidence scoring, and optional VLM correction for degraded documents.\n\n**When to Use**:\n    - Historical/archival document processing (FBI files, old records)\n    - Scanned documents with mixed quality\n    - Documents requiring spatial understanding (forms, tables, 
multi-column)\n    - When you need block-level granularity with bounding boxes\n    - When confidence scoring is needed for downstream filtering\n\n**When NOT to Use**:\n    - Simple text-only documents -> Use text_extractor instead\n    - When page-level granularity is sufficient -> Use pdf_extractor instead\n    - Real-time processing requirements -> VLM correction adds latency","examples":[{"description":"Fast processing mode (no VLM, maximum throughput)","extractor_type":"document_graph_extractor","fast_mode":true,"generate_thumbnails":true,"layout_detector":"pymupdf","run_text_embedding":true,"use_case":"High-volume document ingestion where speed matters more than perfect accuracy","use_layout_detection":true},{"description":"Archival documents with VLM correction (recommended for old scans)","extractor_type":"document_graph_extractor","layout_detector":"pymupdf","min_confidence_for_vlm":0.6,"render_dpi":150,"run_text_embedding":true,"use_case":"Historical archives, FBI files, old scanned documents with degraded quality","use_layout_detection":true,"use_vlm_correction":true,"vlm_model":"gemini-2.5-flash","vlm_provider":"google"},{"description":"SOTA accuracy mode with Docling (best for tables/figures)","extractor_type":"document_graph_extractor","fast_mode":true,"generate_thumbnails":true,"layout_detector":"docling","run_text_embedding":true,"use_case":"Documents with complex tables, figures, or requiring accurate semantic typing","use_layout_detection":true}],"properties":{"extractor_type":{"const":"document_graph_extractor","default":"document_graph_extractor","description":"Discriminator field for parameter type identification. Must be 'document_graph_extractor'.","title":"Extractor Type","type":"string"},"use_layout_detection":{"default":true,"description":"Enable ML-based layout detection to find ALL document elements (text, images, tables, figures). 
When enabled, uses the configured layout_detector to detect and extract both text regions AND non-text elements (scanned images, figures, charts) as separate documents. **Recommended for**: Scanned documents, image-heavy PDFs, mixed content documents. **When disabled**: Falls back to text-only extraction (faster but misses images). Default: True (detects all elements including images).","title":"Use Layout Detection","type":"boolean"},"layout_detector":{"default":"pymupdf","description":"Layout detection engine to use when use_layout_detection=True. 'pymupdf': Fast, rule-based detection using PyMuPDF heuristics (~15 pages/sec). 'docling': SOTA ML-based detection using IBM Docling with DiT model (~3-8 sec/doc). **Docling advantages**: Better semantic type detection (section_header vs paragraph), true table structure extraction (rows/cols), more accurate figure detection. **PyMuPDF advantages**: Much faster, lower memory usage, simpler dependencies. Default: 'pymupdf' for speed. Use 'docling' for accuracy-critical applications.","enum":["pymupdf","docling"],"title":"Layout Detector","type":"string"},"vertical_threshold":{"default":15.0,"description":"Maximum vertical gap (in points) between lines to be grouped in same block. Increase for looser grouping, decrease for tighter blocks. Default 15pt works well for standard documents.","maximum":100.0,"minimum":1.0,"title":"Vertical Threshold","type":"number"},"horizontal_threshold":{"default":50.0,"description":"Maximum horizontal distance (in points) for overlap detection. Affects column detection and block merging. Increase for wider columns, decrease for narrow layouts.","maximum":200.0,"minimum":1.0,"title":"Horizontal Threshold","type":"number"},"min_text_length":{"default":20,"description":"Minimum text length (characters) to keep a block. Blocks with less text are filtered out. 
Helps remove noise and tiny fragments.","maximum":500,"minimum":1,"title":"Min Text Length","type":"integer"},"base_confidence":{"default":0.85,"description":"Base confidence score for embedded (native) text. Penalties are subtracted for OCR artifacts, encoding issues, etc.","maximum":1.0,"minimum":0.0,"title":"Base Confidence","type":"number"},"min_confidence_for_vlm":{"default":0.6,"description":"Confidence threshold below which VLM correction is triggered. Blocks with confidence < this value get sent to VLM for correction. Only applies when use_vlm_correction=True.","maximum":1.0,"minimum":0.0,"title":"Min Confidence For Vlm","type":"number"},"use_vlm_correction":{"default":true,"description":"Enable VLM (Vision Language Model) correction for low-confidence blocks. Uses Gemini/GPT-4V to correct OCR errors by analyzing the page image. Significantly slower (~1 page/sec) but improves accuracy for degraded docs.","title":"Use Vlm Correction","type":"boolean"},"fast_mode":{"default":false,"description":"Skip VLM correction entirely for maximum throughput (~15 pages/sec). Overrides use_vlm_correction. Use when speed is more important than accuracy.","title":"Fast Mode","type":"boolean"},"vlm_provider":{"default":"google","description":"LLM provider for VLM correction. Options: 'google' (Gemini), 'openai' (GPT-4V), 'anthropic' (Claude). Google recommended for best vision quality.","title":"Vlm Provider","type":"string"},"vlm_model":{"default":"gemini-2.5-flash","description":"Specific model for VLM correction. Examples: 'gemini-2.5-flash', 'gpt-4o', 'claude-3-5-sonnet'.","title":"Vlm Model","type":"string"},"llm_api_key":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"API key for VLM correction (BYOK - Bring Your Own Key). Supports:\n- Direct key: 'sk-proj-abc123...'\n- Secret reference: '{{SECRET.openai_api_key}}'\n\nWhen using secret reference, the key is loaded from your organization's secrets vault at runtime. 
Store secrets via POST /v1/organizations/secrets.\n\nIf not provided, uses Mixpeek's default API keys.","title":"Llm Api Key"},"run_text_embedding":{"default":true,"description":"Generate text embeddings for semantic search over block content. Uses E5-Large (1024-dim) for multilingual support.","title":"Run Text Embedding","type":"boolean"},"render_dpi":{"default":150,"description":"DPI for page rendering (used for VLM correction). 72: Fast, lower quality. 150: Balanced (recommended). 300: High quality, slower.","maximum":300,"minimum":72,"title":"Render Dpi","type":"integer"},"generate_thumbnails":{"default":true,"description":"Generate thumbnail images for blocks. Useful for visual previews and UI display.","title":"Generate Thumbnails","type":"boolean"},"thumbnail_mode":{"default":"both","description":"Thumbnail generation mode. 'full_page': Low-res thumbnail of entire page. 'segment': Cropped thumbnail of just the block's bounding box. 'both': Generate both types (recommended for flexibility).","title":"Thumbnail Mode","type":"string"},"thumbnail_dpi":{"default":72,"description":"DPI for thumbnail generation. Lower DPI = smaller files. 72: Standard web quality. 
36: Very small thumbnails.","maximum":150,"minimum":36,"title":"Thumbnail Dpi","type":"integer"}},"title":"DocumentGraphExtractorParams","type":"object"},"supported_input_types":["pdf"],"max_inputs":{"pdf":1},"default_parameters":{},"costs":{"tier":2,"tier_label":"MODERATE","rates":[{"unit":"page","credits_per_unit":5,"description":"Document page processing with layout analysis"},{"unit":"extraction","credits_per_unit":20,"description":"VLM correction per low-confidence block"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://document_graph_extractor@v1/intfloat__multilingual_e5_large_instruct","name":"intfloat__multilingual_e5_large_instruct","description":"Vector index for document graph text embeddings","type":"single","index":{"name":"document_graph_extractor_v1_text_embedding","description":"Dense vector embedding for block text content","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text","string"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["page_number","object_type","block_index"]},{"feature_extractor_name":"passthrough_extractor","version":"v1","feature_extractor_id":"passthrough_extractor_v1","description":"Minimal passthrough extractor for simple object storage. No ML processing - just canonicalization and data preservation. 
Use when you need to store objects without feature extraction.","icon":"arrow-right","source":"builtin","input_schema":{"description":"Input schema for passthrough extractor.\n\nAccepts any content type - just passes it through without processing.","properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or path to content (any type supported).","title":"Content"},"data":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Direct data payload to store.","title":"Data"}},"title":"PassthroughExtractorInput","type":"object"},"output_schema":{"description":"Output schema for passthrough extractor.\n\nPreserves original data with minimal transformation.","properties":{"content_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Canonicalized URL of stored content.","title":"Content Url"},"content_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Detected content type.","title":"Content Type"},"size_bytes":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Content size in bytes.","title":"Size Bytes"},"metadata":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Preserved metadata from source object.","title":"Metadata"}},"title":"PassthroughExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for passthrough extractor.\n\nMinimal configuration - just passes data through with canonicalization.","properties":{"extractor_type":{"const":"passthrough_extractor","default":"passthrough_extractor","description":"Discriminator field for parameter type identification.","title":"Extractor Type","type":"string"},"preserve_metadata":{"default":true,"description":"Preserve source object metadata in output document.","title":"Preserve 
Metadata","type":"boolean"}},"title":"PassthroughExtractorParams","type":"object"},"supported_input_types":["text","image","video","audio","pdf"],"max_inputs":{"text":1,"image":1,"video":1,"audio":1,"pdf":1},"default_parameters":{},"costs":null,"required_vector_indexes":null,"required_payload_indexes":null,"position_fields":[]},{"feature_extractor_name":"ip_frame_extractor","version":"v1","feature_extractor_id":"ip_frame_extractor_v1","description":"Extracts SigLIP SO400M (1152-d) embeddings from video frames and images for IP safety brand/logo detection.\n\n**Pipeline:**\n1. Video → frame extraction (uniform sampling + scene change detection)\n2. Frame → SigLIP SO400M embedding (1152-d, L2-normalized)\n3. One document per frame stored in Qdrant\n\n**IP Safety workflow:**\n- Ingest video with this extractor + face_identity_extractor\n- Create brand text corpus namespace (brand names → SigLIP text embeddings)\n- Create celebrity face corpus namespace (face photos → ArcFace embeddings)\n- Retriever: feature_search frame embeddings vs brand corpus\n- Retriever: feature_search face embeddings vs celebrity corpus\n- Retriever: llm_enrich for visual verification of candidates\n\n**Use for:** IP safety, brand detection, logo search, visual content moderation.\n**Not for:** General image search (use image_extractor), face recognition (use face_identity_extractor).","icon":"shield-check","source":"builtin","input_schema":{"description":"Input schema. Provide image or video.","examples":[{"description":"Single image","image":"s3://bucket/photo.jpg"},{"description":"Video","video":"s3://bucket/content.mp4"}],"properties":{"image":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Image URL or S3 path. Formats: JPEG, PNG, WebP.","examples":["s3://bucket/frames/frame_001.jpg"],"title":"Image"},"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Video URL or S3 path. 
Frames sampled at video_sampling_fps.","examples":["s3://bucket/videos/content.mp4"],"title":"Video"}},"title":"IPFrameExtractorInput","type":"object"},"output_schema":{"additionalProperties":true,"description":"Output schema. One document per frame/region with SigLIP SO400M embedding.","properties":{"ip_frame_extractor_v1_embedding":{"description":"SigLIP SO400M image embedding (1152-d, L2-normalized). Use cosine similarity for zero-shot brand/logo matching against brand text corpus embeddings.","items":{"type":"number"},"maxItems":1152,"minItems":1152,"title":"Ip Frame Extractor V1 Embedding","type":"array"},"frame_index":{"description":"Frame index within the source video (0-based). 0 for images.","title":"Frame Index","type":"integer"},"timestamp":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Timestamp in source video (seconds). None for images.","title":"Timestamp"},"processing_time_ms":{"description":"Processing time in milliseconds.","title":"Processing Time Ms","type":"number"},"region_index":{"default":0,"description":"Region index within frame. 0 = global/fallback, 1+ = detected regions.","title":"Region Index","type":"integer"},"bbox":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Bounding box [x1, y1, x2, y2] normalized [0,1]. None for global embedding.","title":"Bbox"},"detection_score":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"GroundingDINO confidence for this region. None for global embedding.","title":"Detection Score"},"detection_label":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"GroundingDINO label (e.g., 'logo', 'brand'). 
None for global embedding.","title":"Detection Label"},"ocr_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"OCR-extracted text from region (if enable_ocr_verification=True).","title":"Ocr Text"},"ocr_confidence":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"EasyOCR confidence. None if OCR disabled or no text found.","title":"Ocr Confidence"}},"required":["ip_frame_extractor_v1_embedding","frame_index","processing_time_ms"],"title":"IPFrameExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for IP Frame Extractor.\n\nExtracts SigLIP SO400M (1152-d) embeddings from images or video frames.\nThese embeddings enable zero-shot brand/logo matching when searched against\na brand text corpus using feature_search retriever stages.","examples":[{"description":"IP safety scan — standard video","extractor_type":"ip_frame_extractor","max_video_length":300,"scene_change_detection":true,"video_sampling_fps":0.5},{"description":"IP safety scan — image batch","extractor_type":"ip_frame_extractor"}],"properties":{"extractor_type":{"const":"ip_frame_extractor","default":"ip_frame_extractor","description":"Discriminator field. Must be 'ip_frame_extractor'.","title":"Extractor Type","type":"string"},"video_sampling_fps":{"default":1.0,"description":"Frames per second to sample from video. 1.0: One frame per second (recommended for IP safety). 0.5: Every 2 seconds (faster, sufficient for most content).","maximum":10.0,"minimum":0.1,"title":"Video Sampling Fps","type":"number"},"max_video_length":{"default":300,"description":"Maximum video length in seconds to process.","maximum":600,"minimum":1,"title":"Max Video Length","type":"integer"},"max_frames":{"default":300,"description":"Maximum total frames to extract per video.","maximum":1000,"minimum":1,"title":"Max Frames","type":"integer"},"scene_change_detection":{"default":true,"description":"Extract additional frames at scene changes (hard cuts). 
Captures content that uniform sampling might miss.","title":"Scene Change Detection","type":"boolean"},"scene_change_threshold":{"default":200.0,"description":"Grayscale MSE threshold for scene change detection (100-300 typical).","maximum":500.0,"minimum":50.0,"title":"Scene Change Threshold","type":"number"},"enable_region_detection":{"default":true,"description":"Run GroundingDINO to detect logo/brand regions before embedding. When enabled, each frame produces N region embeddings + 1 global fallback. When disabled, falls back to v1 behavior (global embedding only).","title":"Enable Region Detection","type":"boolean"},"detection_prompt":{"default":"logo . brand . trademark . text . icon","description":"GroundingDINO text prompt for zero-shot detection. Dot-separated phrases. Each phrase is a detection category.","title":"Detection Prompt","type":"string"},"box_threshold":{"default":0.25,"description":"Minimum confidence for GroundingDINO bounding box detection.","maximum":0.9,"minimum":0.1,"title":"Box Threshold","type":"number"},"text_threshold":{"default":0.2,"description":"Minimum confidence for GroundingDINO text grounding.","maximum":0.9,"minimum":0.1,"title":"Text Threshold","type":"number"},"max_regions_per_frame":{"default":10,"description":"Maximum detected regions per frame. Prevents explosion on busy frames.","maximum":50,"minimum":1,"title":"Max Regions Per Frame","type":"integer"},"enable_ocr_verification":{"default":false,"description":"Run EasyOCR on detected regions to extract text. Adds ocr_text and ocr_confidence fields to output. 
Useful for wordmark verification but adds ~50ms per region.","title":"Enable Ocr Verification","type":"boolean"}},"title":"IPFrameExtractorParams","type":"object"},"supported_input_types":["image","video"],"max_inputs":{"image":1,"video":1},"default_parameters":{},"costs":{"tier":2,"tier_label":"MODERATE","rates":[{"unit":"image","credits_per_unit":5,"description":"Cost per frame/image processed with GroundingDINO + SigLIP SO400M"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://ip_frame_extractor@v1/google__siglip_so400m","name":"google__siglip_so400m","description":"SigLIP SO400M image embedding for IP safety brand matching","type":"single","index":{"name":"ip_frame_extractor_v1_embedding","description":"1152-d SigLIP SO400M embedding. Enables zero-shot brand/logo detection when searched against brand text corpus via feature_search.","dimensions":1152,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image"],"inference_name":"google__siglip_so400m_patch14_384","inference_service_id":"google/siglip-so400m-patch14-384","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["frame_index"]},{"feature_extractor_name":"brand_corpus_loader","version":"v1","feature_extractor_id":"brand_corpus_loader_v1","description":"Load pre-computed SigLIP SO400M text embeddings for brand corpus.\n\nDeclares the same vector index as ip_frame_extractor (ip_frame_extractor_v1_embedding, 1152-d) so feature_search queries using image frame embeddings can match brand text embeddings.\n\nNo ML processing — embeddings arrive pre-computed via field_passthrough.","icon":"tag","source":"builtin","input_schema":{"additionalProperties":true,"description":"Input schema. 
Each object must have brand metadata and embedding via field_passthrough.","properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or path to content (optional — corpus data comes via field_passthrough).","title":"Content"}},"title":"BrandCorpusLoaderInput","type":"object"},"output_schema":{"additionalProperties":true,"description":"Output schema. Brand entry with SigLIP embedding stored as named vector.","properties":{"ip_frame_extractor_v1_embedding":{"description":"Pre-computed SigLIP SO400M text embedding (1152-d).","items":{"type":"number"},"maxItems":1152,"minItems":1152,"title":"Ip Frame Extractor V1 Embedding","type":"array"}},"required":["ip_frame_extractor_v1_embedding"],"title":"BrandCorpusLoaderOutput","type":"object"},"parameter_schema":{"description":"Parameters for brand corpus loader.","examples":[{"description":"Load brand corpus with pre-computed SigLIP embeddings","extractor_type":"brand_corpus_loader"}],"properties":{"extractor_type":{"const":"brand_corpus_loader","default":"brand_corpus_loader","description":"Discriminator field. Must be 'brand_corpus_loader'.","title":"Extractor Type","type":"string"}},"title":"BrandCorpusLoaderParams","type":"object"},"supported_input_types":["text"],"max_inputs":{"text":1},"default_parameters":{},"costs":{"tier":1,"tier_label":"MINIMAL","rates":[{"unit":"extraction","credits_per_unit":1,"description":"Cost per brand entry loaded (no ML processing)"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://brand_corpus_loader@v1/google__siglip_so400m","name":"google__siglip_so400m","description":"SigLIP SO400M embedding for brand text matching (same space as ip_frame_extractor)","type":"single","index":{"name":"ip_frame_extractor_v1_embedding","description":"1152-d SigLIP SO400M embedding. Pre-computed from brand names. 
Searchable via feature_search using ip_frame_extractor image queries.","dimensions":1152,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text"],"inference_name":"google__siglip_so400m_patch14_384","inference_service_id":"google/siglip-so400m-patch14-384","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":[]},{"feature_extractor_name":"face_corpus_loader","version":"v1","feature_extractor_id":"face_corpus_loader_v1","description":"Load pre-computed ArcFace embeddings for face identity corpus.\n\nDeclares the same vector index as face_identity_extractor (face_identity_extractor_v1_embedding, 512-d) so feature_search queries using face embeddings from video can match known identities.\n\nNo ML processing — embeddings arrive pre-computed via field_passthrough.","icon":"users","source":"builtin","input_schema":{"additionalProperties":true,"description":"Input schema. Each object has face metadata and embedding via field_passthrough.","properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or path to content (optional — corpus data comes via field_passthrough).","title":"Content"}},"title":"FaceCorpusLoaderInput","type":"object"},"output_schema":{"additionalProperties":true,"description":"Output schema. 
Face entry with ArcFace embedding stored as named vector.","properties":{"face_identity_extractor_v1_embedding":{"description":"Pre-computed ArcFace embedding (512-d).","items":{"type":"number"},"maxItems":512,"minItems":512,"title":"Face Identity Extractor V1 Embedding","type":"array"}},"required":["face_identity_extractor_v1_embedding"],"title":"FaceCorpusLoaderOutput","type":"object"},"parameter_schema":{"description":"Parameters for face corpus loader.","examples":[{"description":"Load face corpus with pre-computed ArcFace embeddings","extractor_type":"face_corpus_loader"}],"properties":{"extractor_type":{"const":"face_corpus_loader","default":"face_corpus_loader","description":"Discriminator field. Must be 'face_corpus_loader'.","title":"Extractor Type","type":"string"}},"title":"FaceCorpusLoaderParams","type":"object"},"supported_input_types":["text"],"max_inputs":{"text":1},"default_parameters":{},"costs":{"tier":1,"tier_label":"MINIMAL","rates":[{"unit":"extraction","credits_per_unit":1,"description":"Cost per face entry loaded (no ML processing)"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://face_corpus_loader@v1/insightface__arcface_r100","name":"insightface__arcface_r100","description":"ArcFace R100 embedding for face identity matching (same space as face_identity_extractor)","type":"single","index":{"name":"face_identity_extractor_v1_embedding","description":"512-d ArcFace embedding. Pre-computed from face images. 
Searchable via feature_search using face_identity_extractor queries.","dimensions":512,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image"],"inference_name":"insightface__buffalo_l","inference_service_id":"insightface/buffalo_l","purpose":null,"vector_name_override":null}}],"required_payload_indexes":[],"position_fields":[]},{"feature_extractor_name":"sentiment_classifier","version":"v1","feature_extractor_id":"sentiment_classifier_v1","description":"Text sentiment classification using DistilBERT fine-tuned on SST-2. Classifies text as positive, negative, or neutral with confidence scores. Optionally generates E5 embeddings for semantic retrieval. Useful for content moderation, feedback analysis, and social media monitoring.","icon":"heart","source":"builtin","input_schema":{"description":"Input schema for sentiment classifier.","properties":{"text":{"description":"Text content to classify","title":"Text","type":"string"}},"required":["text"],"title":"SentimentClassifierInput","type":"object"},"output_schema":{"$defs":{"SentimentLabel":{"description":"Individual sentiment label with confidence.","properties":{"label":{"description":"Sentiment label (positive, negative, neutral)","title":"Label","type":"string"},"score":{"description":"Confidence score","maximum":1.0,"minimum":0.0,"title":"Score","type":"number"}},"required":["label","score"],"title":"SentimentLabel","type":"object"}},"description":"Output schema for sentiment classifier.","properties":{"sentiment":{"description":"Primary sentiment classification (positive, negative, neutral)","title":"Sentiment","type":"string"},"confidence":{"description":"Classification confidence","maximum":1.0,"minimum":0.0,"title":"Confidence","type":"number"},"all_scores":{"description":"Scores for all sentiment classes","items":{"$ref":"#/$defs/SentimentLabel"},"title":"All Scores","type":"array"},"text_length":{"description":"Length of input text in characters","title":"Text 
Length","type":"integer"},"text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Original text content (preserved for retrieval)","title":"Text"},"sentiment_classifier_v1_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"E5 text embedding for semantic retrieval (1024-dimensional)","title":"Sentiment Classifier V1 Embedding"}},"required":["sentiment","confidence","text_length"],"title":"SentimentClassifierOutput","type":"object"},"parameter_schema":{"description":"Parameters for sentiment classifier.","properties":{"extractor_type":{"const":"sentiment_classifier","default":"sentiment_classifier","description":"Discriminator field for parameter type identification.","title":"Extractor Type","type":"string"},"model_name":{"default":"distilbert-base-uncased-finetuned-sst-2-english","description":"HuggingFace model name for sentiment classification","title":"Model Name","type":"string"},"max_length":{"default":512,"description":"Maximum token length","maximum":1024,"minimum":32,"title":"Max Length","type":"integer"},"batch_size":{"default":32,"description":"Inference batch size","maximum":128,"minimum":1,"title":"Batch Size","type":"integer"},"return_all_scores":{"default":true,"description":"Return scores for all classes, not just top","title":"Return All Scores","type":"boolean"},"embed":{"default":false,"description":"Generate E5 embeddings for semantic retrieval alongside classification. 
Uses the internal E5 embedding service for 1024-dimensional vectors.","title":"Embed","type":"boolean"}},"title":"SentimentClassifierParams","type":"object"},"supported_input_types":["text","string"],"max_inputs":{"text":1},"default_parameters":{},"costs":null,"required_vector_indexes":null,"required_payload_indexes":null,"position_fields":[]},{"feature_extractor_name":"web_scraper","version":"v1","feature_extractor_id":"web_scraper_v1","description":"Crawls websites and extracts content with multimodal embeddings. Supports documentation sites, job boards, news sites, and SPAs.\n\n**Embedding Types:**\n- Text (E5-Large 1024D): Semantic search over page content\n- Code (Jina Code 768D): Code similarity and API pattern matching\n- Images (SigLIP 768D): Semantic visual search (what is shown)\n- Images (DINOv2 768D): Visual structure comparison (how it looks)\n\n**Use for:** Documentation freshness detection, knowledge base building, job board ingestion, API example indexing, curriculum validation.","icon":"globe","source":"builtin","input_schema":{"description":"Input schema for the web scraper extractor.\n\nAccepts a URL to crawl. The extractor will recursively follow links\nand extract content from all discovered pages.","examples":[{"description":"AWS Boto3 documentation","url":"https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html"},{"description":"Job board","url":"https://boards.greenhouse.io/anthropic"}],"properties":{"url":{"description":"REQUIRED. Seed URL to start crawling from. 
Example: 'https://docs.example.com/api/'","examples":["https://boto3.amazonaws.com/v1/documentation/api/latest/","https://boards.greenhouse.io/anthropic","https://github.com/anthropics/anthropic-sdk-python"],"title":"Url","type":"string"}},"required":["url"],"title":"WebScraperExtractorInput","type":"object"},"output_schema":{"$defs":{"AssetLink":{"description":"A downloadable asset link discovered during crawling.\n\nDESIGN RATIONALE:\n----------------\nDuring web crawling, we encounter links to downloadable files (PDFs, documents,\narchives, etc.) that cannot be directly embedded as text. Rather than:\n1. Following these links and storing unusable binary data, OR\n2. Silently ignoring them via exclude_patterns\n\nWe capture them in a structured array. This enables downstream processing:\n- A separate PDF extractor collection can process these assets\n- Analytics on what documentation assets exist\n- Completeness tracking for documentation coverage\n\nUSAGE:\n------\nAsset links are captured during HTML crawling but NOT followed. They are stored\nas metadata on the parent page document. A downstream pipeline can then:\n1. Query pages with asset_links\n2. Send asset URLs to a dedicated document processing collection\n3. 
Link the extracted content back to the source page\n\nExample downstream workflow:\n    Page A (HTML) -> asset_links: [{url: \"guide.pdf\", ...}]\n                          |\n                          v\n    PDF Collection (with pdf_extractor) -> processes guide.pdf\n                          |\n                          v\n    Linked documents with parent_url reference","properties":{"url":{"description":"Full URL of the downloadable asset","title":"Url","type":"string"},"file_type":{"description":"Asset type detected from extension/content-type (pdf, doc, zip, etc.)","title":"File Type","type":"string"},"link_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Anchor text of the link (provides context about the asset)","title":"Link Text"},"link_title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Title attribute of the link element","title":"Link Title"},"file_extension":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"File extension extracted from URL (e.g., '.pdf', '.docx')","title":"File Extension"}},"required":["url","file_type"],"title":"AssetLink","type":"object"},"CodeBlock":{"description":"A code block extracted from a web page.","properties":{"language":{"description":"Programming language (python, javascript, etc.)","title":"Language","type":"string"},"code":{"description":"The code content","title":"Code","type":"string"},"line_start":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Starting line in source","title":"Line Start"},"line_end":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Ending line in source","title":"Line End"}},"required":["language","code"],"title":"CodeBlock","type":"object"},"ExtractedImage":{"description":"An image extracted from a web page.","properties":{"src":{"description":"Image source 
URL","title":"Src","type":"string"},"alt":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Alt text","title":"Alt"},"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Title attribute","title":"Title"},"width":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Image width in pixels","title":"Width"},"height":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Image height in pixels","title":"Height"}},"required":["src"],"title":"ExtractedImage","type":"object"}},"description":"Output schema for a single document produced by the web scraper.\n\nEach crawled page (or chunk) produces one document with:\n- Text content + E5 embedding (1024D)\n- Code blocks + Jina Code embeddings (768D each)\n- Images + SigLIP embeddings (768D each)\n- Page metadata (URL, title, depth, etc.)","examples":[{"code_blocks":[{"code":"import boto3\ns3 = boto3.client('s3')","language":"python"}],"content":"Getting Started with S3...","description":"Documentation page with code","intfloat__multilingual_e5_large_instruct":[0.01,-0.02],"jinaai__jina_embeddings_v2_base_code":[0.03,-0.04],"page_url":"https://boto3.amazonaws.com/.../quickstart.html","title":"Quickstart - Boto3 Docs"}],"properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Extracted text content from the page or chunk.","title":"Content"},"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Page title extracted from HTML.","title":"Title"},"page_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL of the source page.","title":"Page Url"},"code_blocks":{"anyOf":[{"items":{"$ref":"#/$defs/CodeBlock"},"type":"array"},{"type":"null"}],"default":null,"description":"Code blocks extracted from the page.","title":"Code 
Blocks"},"images":{"anyOf":[{"items":{"$ref":"#/$defs/ExtractedImage"},"type":"array"},{"type":"null"}],"default":null,"description":"Images extracted from the page.","title":"Images"},"asset_links":{"anyOf":[{"items":{"$ref":"#/$defs/AssetLink"},"type":"array"},{"type":"null"}],"default":null,"description":"Downloadable assets discovered on this page (PDFs, docs, archives). These links are captured for downstream processing by specialized extractors (e.g., PDF collection) but are NOT followed during crawling. Use this to build complete documentation coverage including non-HTML assets.","title":"Asset Links"},"intfloat__multilingual_e5_large_instruct":{"anyOf":[{"items":{"type":"number"},"maxItems":1024,"minItems":1024,"type":"array"},{"type":"null"}],"default":null,"description":"E5 embedding for text content (1024D). Derived from intfloat/multilingual-e5-large-instruct.","title":"Intfloat  Multilingual E5 Large Instruct"},"jinaai__jina_embeddings_v2_base_code":{"anyOf":[{"items":{"type":"number"},"maxItems":768,"minItems":768,"type":"array"},{"type":"null"}],"default":null,"description":"Jina code embedding for code blocks (768D). Derived from jinaai/jina-embeddings-v2-base-code.","title":"Jinaai  Jina Embeddings V2 Base Code"},"google__siglip_base_patch16_224":{"anyOf":[{"items":{"type":"number"},"maxItems":768,"minItems":768,"type":"array"},{"type":"null"}],"default":null,"description":"SigLIP embedding for images (768D). Derived from google/siglip-base-patch16-224.","title":"Google  Siglip Base Patch16 224"},"facebook__dinov2_base":{"anyOf":[{"items":{"type":"number"},"maxItems":768,"minItems":768,"type":"array"},{"type":"null"}],"default":null,"description":"DINOv2 visual structure embedding (768D). 
Derived from facebook/dinov2-base.","title":"Facebook  Dinov2 Base"},"chunk_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Index of this chunk within the page.","title":"Chunk Index"},"total_chunks":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Total chunks from this page.","title":"Total Chunks"},"crawl_depth":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Depth from seed URL (0=seed page).","title":"Crawl Depth"},"parent_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL of the page that linked to this one.","title":"Parent Url"}},"title":"WebScraperExtractorOutput","type":"object"},"parameter_schema":{"$defs":{"ChunkStrategy":{"description":"Strategy for splitting page content into chunks.","enum":["none","sentences","paragraphs","words","characters"],"title":"ChunkStrategy","type":"string"},"CrawlMode":{"description":"Mode for crawling web pages.\n\nValues:\n    DETERMINISTIC: BFS crawl following all links up to max_depth\n    SEMANTIC: LLM-guided crawl prioritizing pages relevant to crawl_goal","enum":["deterministic","semantic"],"title":"CrawlMode","type":"string"},"DocumentIdStrategy":{"description":"Strategy for generating deterministic document IDs.\n\nValues:\n    URL: hash(page_url + chunk_index) - stable across re-crawls\n    POSITION: hash(seed_url + page_index + chunk_index) - order-based\n    CONTENT: hash(content) - deduplicates identical content","enum":["url","position","content"],"title":"DocumentIdStrategy","type":"string"},"RenderStrategy":{"description":"Strategy for rendering web pages.\n\nValues:\n    STATIC: Fast HTTP fetch, works for most sites\n    JAVASCRIPT: Browser rendering via Playwright for SPAs\n    AUTO: Try static first, fall back to JS if content too short","enum":["static","javascript","auto"],"title":"RenderStrategy","type":"string"}},"description":"Parameters for the web scraper extractor.\n\nThe web 
scraper extractor crawls websites and extracts content with three types\nof embeddings for comprehensive multimodal search:\n\n**Embedding Types:**\n- Text (E5-Large): 1024D embeddings for page content\n- Code (Jina Code): 768D embeddings for code blocks\n- Images (SigLIP): 768D semantic embeddings for figures/screenshots\n- Images (DINOv2): 768D structure embeddings for visual layout comparison\n\n**Crawl Modes:**\n- DETERMINISTIC: BFS following all links (default, predictable)\n- SEMANTIC: LLM-guided, prioritizes pages matching crawl_goal\n\n**Rendering Strategies:**\n- STATIC: Fast HTTP fetch (default, works for most sites)\n- JAVASCRIPT: Playwright browser for SPAs (React/Vue/Angular)\n- AUTO: Tries static, falls back to JS if content too short\n\n**Use Cases:**\n- Documentation freshness: Crawl docs, compare against course content\n- Job board ingestion: Extract job listings with structured data\n- Knowledge base building: Convert websites to searchable collections\n- Code example indexing: Find API usage patterns across docs","examples":[{"chunk_size":3,"chunk_strategy":"paragraphs","description":"Documentation site crawl","extractor_type":"web_scraper","max_depth":3,"max_pages":100},{"description":"Job board extraction","extractor_type":"web_scraper","max_depth":1,"max_pages":50,"render_strategy":"auto","response_shape":"Extract job title, department, location, and requirements"},{"crawl_goal":"Find all S3 upload examples and API documentation","crawl_mode":"semantic","description":"Semantic crawl for API docs","extractor_type":"web_scraper","generate_code_embeddings":true,"max_pages":200},{"delay_between_requests":0.5,"description":"Large-scale catalogue with resilience","extractor_type":"web_scraper","max_depth":5,"max_pages":10000,"max_retries":5,"respect_retry_after":true},{"description":"Protected site with proxy 
rotation","extractor_type":"web_scraper","max_pages":5000,"persist_cookies":true,"proxies":["http://proxy1.example.com:8080","http://proxy2.example.com:8080"],"rotate_proxy_every_n_requests":50,"rotate_proxy_on_error":true}],"properties":{"extractor_type":{"const":"web_scraper","default":"web_scraper","description":"Discriminator field for parameter type identification.","title":"Extractor Type","type":"string"},"max_depth":{"default":2,"description":"Maximum link depth to crawl. 0=seed page only, 1=seed+direct links, etc. Default: 2. Max: 10.","maximum":10,"minimum":0,"title":"Max Depth","type":"integer"},"max_pages":{"default":50,"description":"Maximum pages to crawl. Default: 50. Max: 500.","maximum":500,"minimum":1,"title":"Max Pages","type":"integer"},"crawl_timeout":{"default":300,"description":"Maximum total time for crawling in seconds. Default: 300 (5 minutes). Increase for large sites with many pages. Max: 3600 (1 hour).","maximum":3600,"minimum":10,"title":"Crawl Timeout","type":"integer"},"crawl_mode":{"$ref":"#/$defs/CrawlMode","default":"deterministic","description":"Crawl strategy. DETERMINISTIC: BFS all links (predictable). SEMANTIC: LLM-guided, prioritizes relevant pages (requires crawl_goal)."},"crawl_goal":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Goal for semantic crawling. Only used when crawl_mode=SEMANTIC. Example: 'Find all S3 API documentation and examples'","title":"Crawl Goal"},"render_strategy":{"$ref":"#/$defs/RenderStrategy","default":"auto","description":"How to render pages. AUTO (default): tries static, falls back to JS. STATIC: fast HTTP fetch. JAVASCRIPT: Playwright browser for SPAs."},"include_patterns":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Regex patterns for URLs to include. 
Example: ['/docs/', '/api/']","title":"Include Patterns"},"exclude_patterns":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Regex patterns for URLs to exclude. Example: ['/blog/', '\\.pdf$']","title":"Exclude Patterns"},"chunk_strategy":{"$ref":"#/$defs/ChunkStrategy","default":"none","description":"How to split page content. NONE: one chunk per page. SENTENCES/PARAGRAPHS: semantic boundaries. WORDS/CHARACTERS: fixed size chunks."},"chunk_size":{"default":500,"description":"Target size for each chunk (in units of chunk_strategy).","maximum":10000,"minimum":1,"title":"Chunk Size","type":"integer"},"chunk_overlap":{"default":50,"description":"Overlap between chunks to preserve context.","maximum":5000,"minimum":0,"title":"Chunk Overlap","type":"integer"},"document_id_strategy":{"$ref":"#/$defs/DocumentIdStrategy","default":"url","description":"How to generate document IDs. URL (default): stable across re-crawls. POSITION: order-based. CONTENT: deduplicates identical content."},"generate_text_embeddings":{"default":true,"description":"Generate E5 embeddings for text content.","title":"Generate Text Embeddings","type":"boolean"},"generate_code_embeddings":{"default":true,"description":"Generate Jina code embeddings for code blocks.","title":"Generate Code Embeddings","type":"boolean"},"generate_image_embeddings":{"default":true,"description":"Generate SigLIP embeddings for images/figures.","title":"Generate Image Embeddings","type":"boolean"},"generate_structure_embeddings":{"default":true,"description":"Generate DINOv2 visual structure embeddings for layout comparison.","title":"Generate Structure Embeddings","type":"boolean"},"response_shape":{"anyOf":[{"type":"string"},{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Optional structured extraction schema. Natural language or JSON schema. 
Example: 'Extract API version, deprecated methods, and example code'","title":"Response Shape"},"llm_provider":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"LLM provider for structured extraction: openai, google, anthropic","title":"Llm Provider"},"llm_model":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"LLM model for structured extraction.","title":"Llm Model"},"llm_api_key":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"API key for LLM operations (BYOK - Bring Your Own Key). Supports:\n- Direct key: 'sk-proj-abc123...'\n- Secret reference: '{{SECRET.openai_api_key}}'\n\nWhen using secret reference, the key is loaded from your organization's secrets vault at runtime. Store secrets via POST /v1/organizations/secrets.\n\nIf not provided, uses Mixpeek's default API keys.","title":"Llm Api Key"},"max_retries":{"default":3,"description":"Maximum retry attempts for failed HTTP requests. Uses exponential backoff with jitter. Default: 3.","maximum":10,"minimum":0,"title":"Max Retries","type":"integer"},"retry_base_delay":{"default":1.0,"description":"Base delay in seconds for retry backoff. Actual delay = base * 2^attempt + jitter. Default: 1.0.","maximum":30.0,"minimum":0.1,"title":"Retry Base Delay","type":"number"},"retry_max_delay":{"default":30.0,"description":"Maximum delay in seconds between retries. Default: 30.","maximum":300.0,"minimum":1.0,"title":"Retry Max Delay","type":"number"},"respect_retry_after":{"default":true,"description":"Respect Retry-After header from 429/503 responses. If False, uses exponential backoff instead. Default: True.","title":"Respect Retry After","type":"boolean"},"proxies":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"List of proxy URLs for rotation. Supports formats: 'http://host:port', 'http://user:pass@host:port', 'socks5://host:port'. 
Proxies rotate on errors or every N requests.","title":"Proxies"},"rotate_proxy_on_error":{"default":true,"description":"Rotate to next proxy when request fails. Default: True.","title":"Rotate Proxy On Error","type":"boolean"},"rotate_proxy_every_n_requests":{"default":0,"description":"Rotate proxy every N requests (0 = disabled). Useful for avoiding IP-based rate limits. Default: 0 (disabled).","maximum":1000,"minimum":0,"title":"Rotate Proxy Every N Requests","type":"integer"},"captcha_service_provider":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Captcha solving service provider: '2captcha', 'anti-captcha', 'capsolver'. If not set, captcha pages are skipped gracefully.","title":"Captcha Service Provider"},"captcha_service_api_key":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"API key for captcha solving service. Supports secret reference: '{{SECRET.captcha_api_key}}'. Required if captcha_service_provider is set.","title":"Captcha Service Api Key"},"detect_captcha":{"default":true,"description":"Detect captcha challenges (Cloudflare, reCAPTCHA, hCaptcha). If detected and no solver configured, page is skipped. Default: True.","title":"Detect Captcha","type":"boolean"},"persist_cookies":{"default":true,"description":"Persist cookies across requests within a crawl session. Useful for sites requiring authentication. Default: True.","title":"Persist Cookies","type":"boolean"},"custom_headers":{"anyOf":[{"additionalProperties":{"type":"string"},"type":"object"},{"type":"null"}],"default":null,"description":"Custom HTTP headers to include in all requests. Example: {'Authorization': 'Bearer token', 'X-Custom': 'value'}","title":"Custom Headers"},"delay_between_requests":{"default":0.0,"description":"Delay in seconds between consecutive requests. Useful for polite crawling and avoiding rate limits. 
Default: 0 (no delay).","maximum":60.0,"minimum":0.0,"title":"Delay Between Requests","type":"number"}},"title":"WebScraperExtractorParams","type":"object"},"supported_input_types":["string","text"],"max_inputs":{"string":1},"default_parameters":{},"costs":{"tier":3,"tier_label":"COMPLEX","rates":[{"unit":"page","credits_per_unit":5,"description":"Web page crawl and text extraction"},{"unit":"extraction","credits_per_unit":1,"description":"Code block embedding with Jina Code"},{"unit":"image","credits_per_unit":2,"description":"Image embedding with SigLIP"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://web_scraper@v1/intfloat__multilingual_e5_large_instruct","name":"intfloat__multilingual_e5_large_instruct","description":"Vector index for text content embeddings.","type":"single","index":{"name":"intfloat__multilingual_e5_large_instruct","description":"E5 embedding for text content.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["string","text"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":"text","vector_name_override":null}},{"feature_uri":"mixpeek://web_scraper@v1/jinaai__jina_embeddings_v2_base_code","name":"jinaai__jina_embeddings_v2_base_code","description":"Vector index for code block embeddings.","type":"single","index":{"name":"jinaai__jina_embeddings_v2_base_code","description":"Jina code embedding for code blocks.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["string","text"],"inference_name":"jinaai__jina_embeddings_v2_base_code","inference_service_id":"jinaai/jina-embeddings-v2-base-code","purpose":"code","vector_name_override":null}},{"feature_uri":"mixpeek://web_scraper@v1/google__siglip_base_patch16_224","name":"google__siglip_base_patch16_224","description":"Vector index for semantic image 
embeddings.","type":"single","index":{"name":"google__siglip_base_patch16_224","description":"SigLIP embedding for semantic visual content.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image"],"inference_name":"google__siglip_base_patch16_224","inference_service_id":"google/siglip-base-patch16-224","purpose":"image","vector_name_override":null}},{"feature_uri":"mixpeek://web_scraper@v1/facebook__dinov2_base","name":"facebook__dinov2_base","description":"Vector index for visual structure embeddings.","type":"single","index":{"name":"facebook__dinov2_base","description":"DINOv2 embedding for fine-grained visual structure comparison.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image"],"inference_name":"facebook__dinov2_base","inference_service_id":"facebook/dinov2-base","purpose":"image","vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["page_url","doc_type","code_index","image_index","chunk_index"]},{"feature_extractor_name":"course_content_extractor","version":"v1","feature_extractor_id":"course_content_extractor_v1","description":"**Educational content extractor** for VIDEO lectures, PDF slides, and CODE archives.\n\nDecomposes educational materials into atomic learning units optimized for retrieval:\n- **Video**: Transcript-based segmentation with SRT support, OCR for screen text\n- **PDF**: Layout-aware extraction of text, figures, tables, and code blocks\n- **Code**: Function-level segmentation with language detection\n\n**Multi-modal Embeddings:**\n- E5 (1024D): Transcripts, slide text, descriptions\n- Jina Code (768D): Code snippets and functions\n- SigLIP (768D): Semantic visual content (what is shown)\n- DINOv2 (768D): Visual structure/layout comparison (how it looks)\n\n**Use for:** Online courses, lecture archives, code tutorials, technical 
training.","icon":"graduation-cap","source":"builtin","input_schema":{"description":"Input schema for course content extractor.","properties":{"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to video file.","title":"Video"},"srt":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to SRT subtitle file.","title":"Srt"},"pdf":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to PDF document.","title":"Pdf"},"code_archive":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to ZIP archive containing source code.","title":"Code Archive"}},"title":"CourseContentExtractorInput","type":"object"},"output_schema":{"description":"Output schema for a single atomic learning unit.","properties":{"unit_type":{"description":"Type of learning unit.","title":"Unit Type","type":"string"},"doc_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Granular document type when expand_to_granular_docs=True.","title":"Doc Type"},"parent_segment_id":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"ID of parent video segment (for granular docs).","title":"Parent Segment Id"},"segment_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Index of this segment within the video.","title":"Segment Index"},"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Generated or extracted title for this unit.","title":"Title"},"start_time":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Start time in seconds.","title":"Start Time"},"end_time":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"End time in seconds.","title":"End Time"},"page_number":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Page number (1-indexed).","title":"Page 
Number"},"element_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Element index within page.","title":"Element Index"},"start_line":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Start line number.","title":"Start Line"},"end_line":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"End line number.","title":"End Line"},"text_content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Primary text content.","title":"Text Content"},"screen_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"OCR-extracted text.","title":"Screen Text"},"code_content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Code snippet or function.","title":"Code Content"},"code_language":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Detected programming language.","title":"Code Language"},"element_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"PDF element type.","title":"Element Type"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of thumbnail.","title":"Thumbnail Url"},"intfloat__multilingual_e5_large_instruct":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"E5 text embedding (1024D).","title":"Intfloat  Multilingual E5 Large Instruct"},"jinaai__jina_embeddings_v2_base_code":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Jina code embedding (768D).","title":"Jinaai  Jina Embeddings V2 Base Code"},"google__siglip_base_patch16_224":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"SigLIP visual embedding (768D).","title":"Google  Siglip Base Patch16 
224"},"facebook__dinov2_base":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"DINOv2 visual structure embedding (768D).","title":"Facebook  Dinov2 Base"},"vlm_frame_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Frame classification: product_ui, code, presenter, slide, generic_console, diagram.","title":"Vlm Frame Type"},"vlm_page_context":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Specific product page/section shown (e.g., 'Guardrails - Add Denied Topics').","title":"Vlm Page Context"},"vlm_ui_labels":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Interactive UI element labels extracted from main content area.","title":"Vlm Ui Labels"},"vlm_workflow_steps":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Ordered workflow steps if a multi-step process is shown.","title":"Vlm Workflow Steps"},"vlm_config_options":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Configuration options/settings visible in the frame.","title":"Vlm Config Options"},"source_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL of source file.","title":"Source Url"},"llm_summary":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"LLM-generated summary.","title":"Llm Summary"}},"required":["unit_type"],"title":"CourseContentExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for the course content extractor.","examples":[{"description":"Standard video processing with all 
embeddings","extractor_type":"course_content_extractor","run_code_embedding":true,"run_structure_embedding":true,"run_text_embedding":true,"run_visual_embedding":true,"target_segment_duration_ms":120000}],"properties":{"extractor_type":{"const":"course_content_extractor","default":"course_content_extractor","description":"Discriminator field. Must be 'course_content_extractor'.","title":"Extractor Type","type":"string"},"target_segment_duration_ms":{"default":120000,"description":"Target duration for video segments in milliseconds.","maximum":600000,"minimum":30000,"title":"Target Segment Duration Ms","type":"integer"},"min_segment_duration_ms":{"default":30000,"description":"Minimum duration for video segments in milliseconds.","minimum":10000,"title":"Min Segment Duration Ms","type":"integer"},"segmentation_method":{"default":"scene","description":"Video segmentation method: 'scene', 'srt', or 'time'.","enum":["scene","srt","time"],"title":"Segmentation Method","type":"string"},"scene_detection_threshold":{"default":0.3,"description":"Scene detection sensitivity (0.0-1.0).","maximum":0.9,"minimum":0.1,"title":"Scene Detection Threshold","type":"number"},"use_whisper_asr":{"default":true,"description":"Use Whisper ASR for transcription instead of SRT subtitles.","title":"Use Whisper Asr","type":"boolean"},"expand_to_granular_docs":{"default":true,"description":"Expand each segment into multiple granular documents.","title":"Expand To Granular Docs","type":"boolean"},"ocr_frames_per_segment":{"default":3,"description":"Number of frames to OCR per video segment.","maximum":10,"minimum":1,"title":"Ocr Frames Per Segment","type":"integer"},"pdf_extraction_mode":{"default":"per_element","description":"How to extract PDF content: 'per_page' or 'per_element'.","enum":["per_page","per_element"],"title":"Pdf Extraction Mode","type":"string"},"pdf_render_dpi":{"default":150,"description":"DPI for rendering PDF pages/elements as 
images.","maximum":300,"minimum":72,"title":"Pdf Render Dpi","type":"integer"},"detect_code_in_pdf":{"default":true,"description":"Whether to detect code blocks in PDF text.","title":"Detect Code In Pdf","type":"boolean"},"segment_functions":{"default":true,"description":"Whether to segment code files into individual functions.","title":"Segment Functions","type":"boolean"},"supported_languages":{"description":"Programming languages to extract from code archives.","items":{"type":"string"},"title":"Supported Languages","type":"array"},"run_text_embedding":{"default":true,"description":"Generate E5 text embeddings (1024D) for transcripts and text.","title":"Run Text Embedding","type":"boolean"},"run_code_embedding":{"default":true,"description":"Generate Jina Code embeddings (768D) for code snippets.","title":"Run Code Embedding","type":"boolean"},"run_visual_embedding":{"default":true,"description":"Generate SigLIP visual embeddings (768D) for video frames.","title":"Run Visual Embedding","type":"boolean"},"run_structure_embedding":{"default":true,"description":"Generate DINOv2 visual structure embeddings (768D) for layout comparison.","title":"Run Structure Embedding","type":"boolean"},"visual_embedding_use_case":{"default":"lecture","description":"Content type preset for visual embedding strategy.","enum":["lecture","code_demo","tutorial","presentation","dynamic"],"title":"Visual Embedding Use Case","type":"string"},"extract_screen_text":{"default":true,"description":"Run OCR on video frames to extract on-screen text.","title":"Extract Screen Text","type":"boolean"},"generate_thumbnails":{"default":true,"description":"Generate thumbnail images for each learning unit.","title":"Generate Thumbnails","type":"boolean"},"use_cdn":{"default":false,"description":"Use CDN for thumbnail delivery.","title":"Use Cdn","type":"boolean"},"run_vlm_frame_analysis":{"default":false,"description":"Run VLM on video frame thumbnails to extract structured fields: frame_type, 
page_context, ui_labels, workflow_steps, config_options. Enables drift detection and UI comparison use cases.","title":"Run Vlm Frame Analysis","type":"boolean"},"vlm_provider":{"default":"google","description":"VLM provider: 'google' (Gemini API) or 'vllm' (local GPU with Qwen2.5-VL).","enum":["google","vllm"],"title":"Vlm Provider","type":"string"},"vlm_model":{"default":"gemini-2.5-flash","description":"VLM model. For google: 'gemini-2.5-flash'. For vllm: 'Qwen/Qwen2.5-VL-7B-Instruct'.","title":"Vlm Model","type":"string"},"enrich_with_llm":{"default":false,"description":"Use Gemini to generate summaries and enhance descriptions.","title":"Enrich With Llm","type":"boolean"},"llm_prompt":{"default":"Summarize this educational content segment, highlighting key concepts.","description":"Prompt for LLM enrichment when enrich_with_llm=True.","title":"Llm Prompt","type":"string"}},"title":"CourseContentExtractorParams","type":"object"},"supported_input_types":["video","pdf","text"],"max_inputs":{"video":1,"pdf":1,"text":1},"default_parameters":{},"costs":{"tier":2,"tier_label":"MODERATE","rates":[{"unit":"minute","credits_per_unit":20,"description":"Video lecture segmentation per minute"},{"unit":"page","credits_per_unit":5,"description":"PDF slide/document page processing"},{"unit":"1k_tokens","credits_per_unit":2,"description":"Text and code embedding per 1K tokens"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://course_content_extractor@v1/intfloat__multilingual_e5_large_instruct","name":"intfloat__multilingual_e5_large_instruct","description":"Vector index for text content embeddings.","type":"single","index":{"name":"intfloat__multilingual_e5_large_instruct","description":"E5 text embedding for transcripts, slide text, 
descriptions.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text","string"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":"text","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/jinaai__jina_embeddings_v2_base_code","name":"jinaai__jina_embeddings_v2_base_code","description":"Vector index for code embeddings.","type":"single","index":{"name":"jinaai__jina_embeddings_v2_base_code","description":"Jina code embedding for code snippets and functions.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text","string"],"inference_name":"jinaai__jina_embeddings_v2_base_code","inference_service_id":"jinaai/jina-embeddings-v2-base-code","purpose":"code","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/google__siglip_base_patch16_224","name":"google__siglip_base_patch16_224","description":"Vector index for visual content embeddings.","type":"single","index":{"name":"google__siglip_base_patch16_224","description":"SigLIP visual embedding with multi-frame aggregation.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image","video"],"inference_name":"google__siglip_base_patch16_224","inference_service_id":"google/siglip-base-patch16-224","purpose":"image","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/facebook__dinov2_base","name":"facebook__dinov2_base","description":"Vector index for visual structure embeddings.","type":"single","index":{"name":"facebook__dinov2_base","description":"DINOv2 visual structure embedding for fine-grained layout 
comparison.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image","video"],"inference_name":"facebook__dinov2_base","inference_service_id":"facebook/dinov2-base","purpose":"image","vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["start_time","end_time","page_number","element_index","doc_type"]}]