{"feature_extractor_name":"course_content_extractor","version":"v1","feature_extractor_id":"course_content_extractor_v1","description":"**Educational content extractor** for VIDEO lectures, PDF slides, and CODE archives.\n\nDecomposes educational materials into atomic learning units optimized for retrieval:\n- **Video**: Transcript-based segmentation with SRT support, OCR for screen text\n- **PDF**: Layout-aware extraction of text, figures, tables, and code blocks\n- **Code**: Function-level segmentation with language detection\n\n**Multi-modal Embeddings:**\n- E5 (1024D): Transcripts, slide text, descriptions\n- Jina Code (768D): Code snippets and functions\n- SigLIP (768D): Semantic visual content (what is shown)\n- DINOv2 (768D): Visual structure/layout comparison (how it looks)\n\n**Use for:** Online courses, lecture archives, code tutorials, technical training.","icon":"graduation-cap","source":"builtin","input_schema":{"description":"Input schema for course content extractor.","properties":{"video":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to video file.","title":"Video"},"srt":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to SRT subtitle file.","title":"Srt"},"pdf":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to PDF document.","title":"Pdf"},"code_archive":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL or S3 path to ZIP archive containing source code.","title":"Code Archive"}},"title":"CourseContentExtractorInput","type":"object"},"output_schema":{"description":"Output schema for a single atomic learning unit.","properties":{"unit_type":{"description":"Type of learning unit.","title":"Unit Type","type":"string"},"doc_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Granular document type when expand_to_granular_docs=True.","title":"Doc 
Type"},"parent_segment_id":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"ID of parent video segment (for granular docs).","title":"Parent Segment Id"},"segment_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Index of this segment within the video.","title":"Segment Index"},"title":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Generated or extracted title for this unit.","title":"Title"},"start_time":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"Start time in seconds.","title":"Start Time"},"end_time":{"anyOf":[{"type":"number"},{"type":"null"}],"default":null,"description":"End time in seconds.","title":"End Time"},"page_number":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Page number (1-indexed).","title":"Page Number"},"element_index":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Element index within page.","title":"Element Index"},"start_line":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"Start line number.","title":"Start Line"},"end_line":{"anyOf":[{"type":"integer"},{"type":"null"}],"default":null,"description":"End line number.","title":"End Line"},"text_content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Primary text content.","title":"Text Content"},"screen_text":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"OCR-extracted text.","title":"Screen Text"},"code_content":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Code snippet or function.","title":"Code Content"},"code_language":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Detected programming language.","title":"Code Language"},"element_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"PDF element type.","title":"Element 
Type"},"thumbnail_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"S3 URL of thumbnail.","title":"Thumbnail Url"},"intfloat__multilingual_e5_large_instruct":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"E5 text embedding (1024D).","title":"Intfloat  Multilingual E5 Large Instruct"},"jinaai__jina_embeddings_v2_base_code":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"Jina code embedding (768D).","title":"Jinaai  Jina Embeddings V2 Base Code"},"google__siglip_base_patch16_224":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"SigLIP visual embedding (768D).","title":"Google  Siglip Base Patch16 224"},"facebook__dinov2_base":{"anyOf":[{"items":{"type":"number"},"type":"array"},{"type":"null"}],"default":null,"description":"DINOv2 visual structure embedding (768D).","title":"Facebook  Dinov2 Base"},"vlm_frame_type":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Frame classification: product_ui, code, presenter, slide, generic_console, diagram.","title":"Vlm Frame Type"},"vlm_page_context":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"Specific product page/section shown (e.g., 'Guardrails - Add Denied Topics').","title":"Vlm Page Context"},"vlm_ui_labels":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Interactive UI element labels extracted from main content area.","title":"Vlm Ui Labels"},"vlm_workflow_steps":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Ordered workflow steps if a multi-step process is shown.","title":"Vlm Workflow Steps"},"vlm_config_options":{"anyOf":[{"items":{"type":"string"},"type":"array"},{"type":"null"}],"default":null,"description":"Configuration options/settings visible in the 
frame.","title":"Vlm Config Options"},"source_url":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"URL of source file.","title":"Source Url"},"llm_summary":{"anyOf":[{"type":"string"},{"type":"null"}],"default":null,"description":"LLM-generated summary.","title":"Llm Summary"}},"required":["unit_type"],"title":"CourseContentExtractorOutput","type":"object"},"parameter_schema":{"description":"Parameters for the course content extractor.","examples":[{"description":"Standard video processing with all embeddings","extractor_type":"course_content_extractor","run_code_embedding":true,"run_structure_embedding":true,"run_text_embedding":true,"run_visual_embedding":true,"target_segment_duration_ms":120000}],"properties":{"extractor_type":{"const":"course_content_extractor","default":"course_content_extractor","description":"Discriminator field. Must be 'course_content_extractor'.","title":"Extractor Type","type":"string"},"target_segment_duration_ms":{"default":120000,"description":"Target duration for video segments in milliseconds.","maximum":600000,"minimum":30000,"title":"Target Segment Duration Ms","type":"integer"},"min_segment_duration_ms":{"default":30000,"description":"Minimum duration for video segments in milliseconds.","minimum":10000,"title":"Min Segment Duration Ms","type":"integer"},"segmentation_method":{"default":"scene","description":"Video segmentation method: 'scene', 'srt', or 'time'.","enum":["scene","srt","time"],"title":"Segmentation Method","type":"string"},"scene_detection_threshold":{"default":0.3,"description":"Scene detection sensitivity (0.1-0.9).","maximum":0.9,"minimum":0.1,"title":"Scene Detection Threshold","type":"number"},"use_whisper_asr":{"default":true,"description":"Use Whisper ASR for transcription instead of SRT subtitles.","title":"Use Whisper Asr","type":"boolean"},"expand_to_granular_docs":{"default":true,"description":"Expand each segment into multiple granular documents.","title":"Expand To Granular 
Docs","type":"boolean"},"ocr_frames_per_segment":{"default":3,"description":"Number of frames to OCR per video segment.","maximum":10,"minimum":1,"title":"Ocr Frames Per Segment","type":"integer"},"pdf_extraction_mode":{"default":"per_element","description":"How to extract PDF content: 'per_page' or 'per_element'.","enum":["per_page","per_element"],"title":"Pdf Extraction Mode","type":"string"},"pdf_render_dpi":{"default":150,"description":"DPI for rendering PDF pages/elements as images.","maximum":300,"minimum":72,"title":"Pdf Render Dpi","type":"integer"},"detect_code_in_pdf":{"default":true,"description":"Whether to detect code blocks in PDF text.","title":"Detect Code In Pdf","type":"boolean"},"segment_functions":{"default":true,"description":"Whether to segment code files into individual functions.","title":"Segment Functions","type":"boolean"},"supported_languages":{"description":"Programming languages to extract from code archives.","items":{"type":"string"},"title":"Supported Languages","type":"array"},"run_text_embedding":{"default":true,"description":"Generate E5 text embeddings (1024D) for transcripts and text.","title":"Run Text Embedding","type":"boolean"},"run_code_embedding":{"default":true,"description":"Generate Jina Code embeddings (768D) for code snippets.","title":"Run Code Embedding","type":"boolean"},"run_visual_embedding":{"default":true,"description":"Generate SigLIP visual embeddings (768D) for video frames.","title":"Run Visual Embedding","type":"boolean"},"run_structure_embedding":{"default":true,"description":"Generate DINOv2 visual structure embeddings (768D) for layout comparison.","title":"Run Structure Embedding","type":"boolean"},"visual_embedding_use_case":{"default":"lecture","description":"Content type preset for visual embedding strategy.","enum":["lecture","code_demo","tutorial","presentation","dynamic"],"title":"Visual Embedding Use Case","type":"string"},"extract_screen_text":{"default":true,"description":"Run OCR on video 
frames to extract on-screen text.","title":"Extract Screen Text","type":"boolean"},"generate_thumbnails":{"default":true,"description":"Generate thumbnail images for each learning unit.","title":"Generate Thumbnails","type":"boolean"},"use_cdn":{"default":false,"description":"Use CDN for thumbnail delivery.","title":"Use Cdn","type":"boolean"},"run_vlm_frame_analysis":{"default":false,"description":"Run VLM on video frame thumbnails to extract structured fields: frame_type, page_context, ui_labels, workflow_steps, config_options. Enables drift detection and UI comparison use cases.","title":"Run Vlm Frame Analysis","type":"boolean"},"vlm_provider":{"default":"google","description":"VLM provider: 'google' (Gemini API) or 'vllm' (local GPU with Qwen2.5-VL).","enum":["google","vllm"],"title":"Vlm Provider","type":"string"},"vlm_model":{"default":"gemini-2.5-flash","description":"VLM model. For google: 'gemini-2.5-flash'. For vllm: 'Qwen/Qwen2.5-VL-7B-Instruct'.","title":"Vlm Model","type":"string"},"enrich_with_llm":{"default":false,"description":"Use Gemini to generate summaries and enhance descriptions.","title":"Enrich With Llm","type":"boolean"},"llm_prompt":{"default":"Summarize this educational content segment, highlighting key concepts.","description":"Prompt for LLM enrichment when enrich_with_llm=True.","title":"Llm Prompt","type":"string"}},"title":"CourseContentExtractorParams","type":"object"},"supported_input_types":["video","pdf","text"],"max_inputs":{"video":1,"pdf":1,"text":1},"default_parameters":{},"costs":{"tier":2,"tier_label":"MODERATE","rates":[{"unit":"minute","credits_per_unit":20,"description":"Video lecture segmentation per minute"},{"unit":"page","credits_per_unit":5,"description":"PDF slide/document page processing"},{"unit":"1k_tokens","credits_per_unit":2,"description":"Text and code embedding per 1K 
tokens"}]},"required_vector_indexes":[{"feature_uri":"mixpeek://course_content_extractor@v1/intfloat__multilingual_e5_large_instruct","name":"intfloat__multilingual_e5_large_instruct","description":"Vector index for text content embeddings.","type":"single","index":{"name":"intfloat__multilingual_e5_large_instruct","description":"E5 text embedding for transcripts, slide text, descriptions.","dimensions":1024,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text","string"],"inference_name":"intfloat__multilingual_e5_large_instruct","inference_service_id":"intfloat/multilingual-e5-large-instruct","purpose":"text","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/jinaai__jina_embeddings_v2_base_code","name":"jinaai__jina_embeddings_v2_base_code","description":"Vector index for code embeddings.","type":"single","index":{"name":"jinaai__jina_embeddings_v2_base_code","description":"Jina code embedding for code snippets and functions.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["text","string"],"inference_name":"jinaai__jina_embeddings_v2_base_code","inference_service_id":"jinaai/jina-embeddings-v2-base-code","purpose":"code","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/google__siglip_base_patch16_224","name":"google__siglip_base_patch16_224","description":"Vector index for visual content embeddings.","type":"single","index":{"name":"google__siglip_base_patch16_224","description":"SigLIP visual embedding with multi-frame 
aggregation.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image","video"],"inference_name":"google__siglip_base_patch16_224","inference_service_id":"google/siglip-base-patch16-224","purpose":"image","vector_name_override":null}},{"feature_uri":"mixpeek://course_content_extractor@v1/facebook__dinov2_base","name":"facebook__dinov2_base","description":"Vector index for visual structure embeddings.","type":"single","index":{"name":"facebook__dinov2_base","description":"DINOv2 visual structure embedding for fine-grained layout comparison.","dimensions":768,"type":"dense","distance":"Cosine","datatype":"float32","on_disk":null,"supported_inputs":["image","video"],"inference_name":"facebook__dinov2_base","inference_service_id":"facebook/dinov2-base","purpose":"image","vector_name_override":null}}],"required_payload_indexes":[],"position_fields":["start_time","end_time","page_number","element_index","doc_type"]}