{"feature_extractor_name":"multimodal_extractor","version":"v2","feature_extractor_id":"multimodal_extractor_v2","description":"**Multimodal extractor v2** using **Gemini Embedding 2** (3072D) for unified embeddings.\n\nSame pipeline as v1 (FFmpeg chunking, Whisper transcription, thumbnails, Gemini vision) but with an upgraded embedding model:\n- **v1**: Vertex Multimodal Embedding (1408D)\n- **v2**: Gemini Embedding 2 (3072D, configurable: 1536/768)\n\nGemini Embedding 2 is Google's first natively multimodal embedding model, mapping text, images, video (up to 120s), audio, and PDFs into a unified space.\n\n**Pipeline Steps:**\n1. FFmpeg chunking (time/scene/silence)\n2. Whisper transcription (optional)\n3. E5 transcription embeddings (optional, 1024D)\n4. **Gemini Embedding 2** multimodal embeddings (3072D)\n5. Thumbnail generation (optional)\n6. Gemini visual description/OCR (optional)\n\n**Use for:** Unified multimodal search with higher-dimensional embeddings and native multimodal understanding.","icon":"film","category":"multimodal","source":"builtin","type_mode":"multimodal","expected_input_types":null,"inference_type":"embedding","input_schema":{"description":"Input schema for multimodal extractor v2.\n\nSame inputs as v1 — video, image, text, or gif.\nEmbeddings use Gemini Embedding 2 (3072D) instead of Vertex (1408D).","oneOf":[{"required":["video"]},{"required":["image"]},{"required":["text"]},{"required":["gif"]}],"properties":{"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to video file. Decomposed into segments.","title":"Video"},"image":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to image file. Embedded directly.","title":"Image"},"text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Plain text content. 
Embedded directly.","title":"Text"},"gif":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to GIF file. Treated as video.","title":"Gif"},"custom_thumbnail":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Optional custom thumbnail URL.","title":"Custom Thumbnail"}},"title":"MultimodalExtractorInput","type":"object"},"output_schema":{"description":"Output schema for multimodal extractor v2.","properties":{"start_time":{"description":"Start time of the segment in seconds","title":"Start Time","type":"number"},"end_time":{"description":"End time of the segment in seconds","title":"End Time","type":"number"},"start_frame":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Start frame number of the segment (start_time * fps)","title":"Start Frame"},"end_frame":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"End frame number of the segment (end_time * fps)","title":"End Frame"},"fps":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Frame rate of the video at the time of splitting","title":"Fps"},"source_fps":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Original source video frame rate before preprocessing (e.g. 
29.97, 30, 23.976)","title":"Source Fps"},"duration":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Total source video duration in seconds","title":"Duration"},"transcription":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Transcription of audio","title":"Transcription"},"description":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Generated segment description","title":"Description"},"ocr_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"OCR text from video frames","title":"Ocr Text"},"json_output":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Raw JSON from underlying models","title":"Json Output"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Thumbnail image URL","title":"Thumbnail Url"},"source_video_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Original source video URL","title":"Source Video Url"},"video_segment_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Video segment URL","title":"Video Segment Url"},"multimodal_extractor_v2_multimodal_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embeddings (3072D) via Gemini Embedding 2 for multimodal content.","title":"Multimodal Extractor V2 Multimodal Embedding"},"multimodal_extractor_v2_transcription_embedding":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Dense vector embeddings (1024D) for transcription text via E5-Large.","title":"Multimodal Extractor V2 Transcription Embedding"},"internal_metadata":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Internal processing metadata","title":"Internal 
Metadata"}},"required":["start_time","end_time"],"title":"MultimodalExtractorOutput","type":"object"},"parameter_schema":{"$defs":{"GenerationConfig":{"description":"Configuration for generative models.","properties":{"candidate_count":{"default":1,"description":"Number of candidate responses to generate for video description.","title":"Candidate Count","type":"integer"},"max_output_tokens":{"default":1024,"description":"Maximum number of tokens for the generated video description.","title":"Max Output Tokens","type":"integer"},"temperature":{"default":0.2,"description":"Controls randomness for video description generation. Higher is more random.","title":"Temperature","type":"number"},"top_p":{"default":0.8,"description":"Nucleus sampling (top-p) for video description generation.","title":"Top P","type":"number"},"response_mime_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"MIME type for response (e.g., 'application/json')","title":"Response Mime Type"},"response_schema":{"anyOf":[{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"JSON schema for structured output","title":"Response Schema"}},"title":"GenerationConfig","type":"object"},"SplitMethod":{"enum":["time","scene","silence"],"title":"SplitMethod","type":"string"}},"description":"Parameters for multimodal extractor v2.\n\nSame pipeline as v1 but uses Gemini Embedding 2 (3072D) for the\nmultimodal embedding step. 
Supports configurable output dimensions\nvia Matryoshka representation learning (3072/1536/768).","examples":[{"description":"Standard video processing with Gemini Embedding 2","enable_thumbnails":true,"extractor_type":"multimodal_extractor","output_dimensionality":3072,"run_multimodal_embedding":true,"split_method":"time","time_split_interval":10},{"description":"Compact embeddings for high-scale use cases","extractor_type":"multimodal_extractor","output_dimensionality":768,"run_multimodal_embedding":true,"split_method":"time","time_split_interval":10}],"properties":{"extractor_type":{"const":"multimodal_extractor","default":"multimodal_extractor","description":"Discriminator field. Must be 'multimodal_extractor'.","title":"Extractor Type","type":"string"},"split_method":{"$ref":"#/$defs/SplitMethod","default":"time","description":"Video splitting strategy."},"description_prompt":{"default":"Watch this video segment carefully and describe exactly what you see. Do not make up or infer details that are not visible in the footage. Include: who is shown (gender, appearance, actions), what they are doing, the setting/location, and any products, text, or branding visible on screen.","description":"Prompt for description generation.","title":"Description Prompt","type":"string"},"time_split_interval":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":10,"description":"Interval in seconds for 'time' splitting.","title":"Time Split Interval"},"silence_db_threshold":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Decibel threshold for silence detection. Recommended: -40.","title":"Silence Db Threshold"},"scene_detection_threshold":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Scene detection threshold (0.0-1.0). 
Recommended: 0.5.","title":"Scene Detection Threshold"},"run_transcription":{"default":false,"description":"Run Whisper transcription on segments.","title":"Run Transcription","type":"boolean"},"transcription_language":{"default":"en","description":"Transcription language code.","title":"Transcription Language","type":"string"},"run_video_description":{"default":false,"description":"Generate Gemini descriptions for segments.","title":"Run Video Description","type":"boolean"},"run_transcription_embedding":{"default":false,"description":"Generate E5 embeddings for transcriptions (1024D).","title":"Run Transcription Embedding","type":"boolean"},"run_multimodal_embedding":{"default":true,"description":"Generate Gemini Embedding 2 multimodal embeddings (3072D). Creates unified embeddings across video, image, text, audio, and GIF content.","title":"Run Multimodal Embedding","type":"boolean"},"run_ocr":{"default":false,"description":"Extract text from video frames via Gemini OCR.","title":"Run Ocr","type":"boolean"},"max_segment_duration":{"anyOf":[{"type":"number"},{"type":"null"}],"default":30.0,"description":"Maximum duration in seconds for any single segment. Scene/silence segments longer than this are subdivided. Set to null to disable. Default: 30s.","title":"Max Segment Duration"},"sensitivity":{"default":"low","description":"Scene detection sensitivity.","title":"Sensitivity","type":"string"},"enable_thumbnails":{"default":true,"description":"Generate thumbnail images for segments.","title":"Enable Thumbnails","type":"boolean"},"use_cdn":{"default":false,"description":"Use CloudFront CDN for thumbnail delivery.","title":"Use Cdn","type":"boolean"},"generation_config":{"$ref":"#/$defs/GenerationConfig"},"output_dimensionality":{"default":3072,"description":"Output embedding dimensions. 
Gemini Embedding 2 supports Matryoshka dimension reduction: 3072 (full), 1536, or 768.","title":"Output Dimensionality","type":"integer"},"task_type":{"default":"RETRIEVAL_DOCUMENT","description":"Embedding task type hint. Options: RETRIEVAL_DOCUMENT, RETRIEVAL_QUERY, SEMANTIC_SIMILARITY, CLASSIFICATION.","title":"Task Type","type":"string"},"response_shape":{"anyOf":[{"type":"string"},{"additionalProperties":true,"type":"object"},{"type":"null"}],"default":null,"description":"Custom structured output schema for Gemini extraction. Use a string for a natural-language prompt, or an object for an explicit JSON schema.","title":"Response Shape"}},"title":"MultimodalExtractorParams","type":"object"},"supported_input_types":["video","image","audio","text","string"],"max_inputs":{"video":1,"image":1,"audio":1,"text":1,"string":1},"default_parameters":{},"costs":{"tier":4,"tier_label":"PREMIUM","rates":[{"unit":"minute","credits_per_unit":50,"description":"Video processing per minute"},{"unit":"image","credits_per_unit":5,"description":"Image analysis"},{"unit":"1k_tokens","credits_per_unit":2,"description":"Text processing per 1K tokens"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://multimodal_extractor@v2/gemini-embedding-2","name":"gemini-embedding-2","description":"Vector index for Gemini Embedding 2 multimodal embeddings.","type":"single","index":{"name":"multimodal_extractor_v2_multimodal_embedding","description":"Dense 3072D embedding via Gemini Embedding 2 for all content types.","dimensions":3072,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["video","text","image","audio"],"inference_name":"google__gemini_embedding_2","inference_service_id":"google/gemini-embedding-2","purpose":null,"vector_name_override":null,"supports_multi_query":false}},{"feature_uri":"mixpeek://multimodal_extractor@v2/multilingual_e5_large_instruct_v1","name":"multilingual_e5_large_instruct_v1","description":"Vector index for transcription 
embeddings.","type":"single","index":{"name":"multimodal_extractor_v2_transcription_embedding","description":"Dense 1024D embedding for transcriptions via E5-Large.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["string","text"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":null,"vector_name_override":null,"supports_multi_query":false}}],"required_payload_indexes":[],"position_fields":["start_time","end_time"],"capabilities":["batch","realtime"],"example_usage":{"namespace":{"feature_extractors":[{"name":"multimodal_extractor","version":"v2"}]},"collection":{"feature_extractor":{"name":"multimodal_extractor","version":"v2","input_mappings":{"video":"<your_video_field>","image":"<your_image_field>","text":"<your_text_field>","gif":"<your_gif_field>","custom_thumbnail":"<your_custom_thumbnail_field>"},"parameters":{"split_method":"time","description_prompt":"Watch this video segment carefully and describe exactly what you see. Do not make up or infer details that are not visible in the footage. Include: who is shown (gender, appearance, actions), what they are doing, the setting/location, and any products, text, or branding visible on screen.","time_split_interval":10,"run_transcription":false,"transcription_language":"en","run_video_description":false,"run_transcription_embedding":false,"run_multimodal_embedding":true,"run_ocr":false,"max_segment_duration":30.0,"sensitivity":"low","enable_thumbnails":true,"use_cdn":false,"output_dimensionality":3072,"task_type":"RETRIEVAL_DOCUMENT"}}}}}