Coverage for src/refinire/agents/extractor.py: 71%
343 statements
coverage.py v7.9.1, created at 2025-06-15 18:51 +0900
1"""
2ExtractorAgent implementation for information extraction from unstructured data.
4ExtractorAgentは非構造化データから情報抽出を行うエージェントです。
5テキスト、HTML、JSONなどの様々な形式から特定の情報を抽出し、
6構造化されたデータとして出力します。
7"""
9import logging
10import re
11import json
12from abc import ABC, abstractmethod
13from typing import Any, List, Optional, Dict, Union, Callable
14from pydantic import BaseModel, Field, field_validator
15from datetime import datetime
16import html
17from html.parser import HTMLParser
18from urllib.parse import urlparse
20from .flow.context import Context
21from .flow.step import Step
22from .pipeline.llm_pipeline import LLMPipeline
24logger = logging.getLogger(__name__)


class ExtractionRule(ABC):
    """
    Abstract base class for extraction rules.
    抽出ルールの抽象基底クラス。
    """

    def __init__(self, name: str, description: str = ""):
        """
        Initialize extraction rule.
        抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            description: Rule description / ルールの説明
        """
        self.name = name
        self.description = description

    @abstractmethod
    def extract(self, data: str, context: Context) -> Any:
        """
        Extract information from the data.
        データから情報を抽出します。

        Args:
            data: Input data to extract from / 抽出対象の入力データ
            context: Execution context / 実行コンテキスト

        Returns:
            Any: Extracted information / 抽出された情報
        """
        pass


class RegexExtractionRule(ExtractionRule):
    """
    Rule to extract information using regular expressions.
    正規表現を使って情報を抽出するルール。
    """

    def __init__(self, name: str, pattern: str, group: int = 0, multiple: bool = False):
        """
        Initialize regex extraction rule.
        正規表現抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            pattern: Regular expression pattern / 正規表現パターン
            group: Capture group number to extract / 抽出するキャプチャグループ番号
            multiple: Whether to extract all matches / 全てのマッチを抽出するかどうか
        """
        super().__init__(name, f"Extract using regex pattern: {pattern}")
        self.pattern = re.compile(pattern, re.IGNORECASE | re.MULTILINE)
        self.group = group
        self.multiple = multiple

    def extract(self, data: str, context: Context) -> Union[str, List[str], None]:
        """Extract using regex pattern."""
        if not isinstance(data, str):
            return None

        if self.multiple:
            matches = self.pattern.findall(data)
            return matches if matches else []
        else:
            match = self.pattern.search(data)
            if match:
                return match.group(self.group)
            return None
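

# Illustrative sketch (not part of the original source): how a RegexExtractionRule
# could be exercised on its own. extract() never reads its context argument, so None
# is passed here purely to satisfy the signature; inside a flow the Context is supplied
# by the surrounding Step. The rule name and pattern below are invented for the example.
#
#   order_rule = RegexExtractionRule("order_ids", r"ORD-\d{6}", multiple=True)
#   order_rule.extract("Orders ORD-000123 and ORD-000456 shipped.", None)
#   # -> ["ORD-000123", "ORD-000456"]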


class EmailExtractionRule(RegexExtractionRule):
    """
    Rule to extract email addresses.
    メールアドレスを抽出するルール。
    """

    def __init__(self, name: str = "email_extractor", multiple: bool = True):
        pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        super().__init__(name, pattern, group=0, multiple=multiple)


class PhoneExtractionRule(RegexExtractionRule):
    """
    Rule to extract phone numbers.
    電話番号を抽出するルール。
    """

    def __init__(self, name: str = "phone_extractor", multiple: bool = True):
        # Pattern for various phone number formats
        pattern = r'(\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'
        super().__init__(name, pattern, group=0, multiple=multiple)


class URLExtractionRule(RegexExtractionRule):
    """
    Rule to extract URLs.
    URLを抽出するルール。
    """

    def __init__(self, name: str = "url_extractor", multiple: bool = True):
        pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        super().__init__(name, pattern, group=0, multiple=multiple)


class DateExtractionRule(RegexExtractionRule):
    """
    Rule to extract dates in various formats.
    様々な形式の日付を抽出するルール。
    """

    def __init__(self, name: str = "date_extractor", multiple: bool = True):
        # Pattern for common date formats
        pattern = r'\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b'
        super().__init__(name, pattern, group=0, multiple=multiple)


class SimpleHTMLParser(HTMLParser):
    """
    Simple HTML parser using standard library.
    標準ライブラリを使ったシンプルなHTMLパーサー。
    """

    def __init__(self, target_tag: str = None, target_attribute: str = None):
        super().__init__()
        self.target_tag = target_tag.lower() if target_tag else None
        self.target_attribute = target_attribute
        self.results = []
        self.current_tag = None
        self.current_attrs = {}
        self.current_data = ""
        self.capture_data = False

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag.lower()
        self.current_attrs = dict(attrs)

        if self.target_tag and tag.lower() == self.target_tag:
            self.capture_data = True
            self.current_data = ""

            # If we want an attribute, extract it now
            if self.target_attribute and self.target_attribute in self.current_attrs:
                self.results.append(self.current_attrs[self.target_attribute])
                self.capture_data = False

    def handle_endtag(self, tag):
        if self.capture_data and tag.lower() == self.target_tag:
            if self.current_data.strip():
                self.results.append(self.current_data.strip())
            self.capture_data = False
            self.current_data = ""

    def handle_data(self, data):
        if self.capture_data:
            self.current_data += data
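

# Illustrative sketch (not part of the original source): SimpleHTMLParser collects
# either attribute values or text content for a target tag. The markup below is an
# invented example.
#
#   parser = SimpleHTMLParser(target_tag="a", target_attribute="href")
#   parser.feed('<p>See <a href="https://example.com">the docs</a>.</p>')
#   parser.results  # -> ["https://example.com"]
#
# With target_attribute=None the parser captures the anchor text instead,
# yielding ["the docs"].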


class HTMLExtractionRule(ExtractionRule):
    """
    Rule to extract information from HTML using basic tag matching.
    基本的なタグマッチングを使ってHTMLから情報を抽出するルール。
    """

    def __init__(self, name: str, tag: str = None, attribute: str = None, multiple: bool = False):
        """
        Initialize HTML extraction rule.
        HTML抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            tag: HTML tag name / HTMLタグ名
            attribute: Attribute to extract / 抽出する属性
            multiple: Whether to extract from all matching elements / 全てのマッチする要素から抽出するかどうか
        """
        super().__init__(name, f"Extract from HTML using tag: {tag}")
        self.tag = tag
        self.attribute = attribute
        self.multiple = multiple

    def extract(self, data: str, context: Context) -> Union[str, List[str], None]:
        """Extract from HTML using standard library HTMLParser."""
        try:
            if not self.tag:
                # If no tag specified, try to extract all text content
                parser = HTMLParser()
                parser.feed(data)
                # Simple text extraction - remove HTML tags
                clean_text = re.sub(r'<[^>]+>', ' ', data)
                clean_text = re.sub(r'\s+', ' ', clean_text).strip()
                return clean_text if clean_text else None

            # Use custom parser for specific tag extraction
            parser = SimpleHTMLParser(self.tag, self.attribute)
            parser.feed(data)

            results = parser.results

            if not results:
                return [] if self.multiple else None

            if self.multiple:
                return results
            else:
                return results[0] if results else None

        except Exception as e:
            logger.warning(f"HTML extraction error: {e}")
            return [] if self.multiple else None


class JSONExtractionRule(ExtractionRule):
    """
    Rule to extract information from JSON using JSONPath-like syntax.
    JSONPath類似の構文を使ってJSONから情報を抽出するルール。
    """

    def __init__(self, name: str, path: str, multiple: bool = False):
        """
        Initialize JSON extraction rule.
        JSON抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            path: JSONPath-like expression (e.g., "data.items.*.name") / JSONPath類似の式
            multiple: Whether to extract all matches / 全てのマッチを抽出するかどうか
        """
        super().__init__(name, f"Extract from JSON using path: {path}")
        self.path = path.split('.')
        self.multiple = multiple

    def extract(self, data: str, context: Context) -> Union[Any, List[Any], None]:
        """Extract from JSON using simple path traversal."""
        try:
            if isinstance(data, str):
                json_data = json.loads(data)
            else:
                json_data = data

            results = self._extract_from_path(json_data, self.path, 0)

            if self.multiple:
                return results if isinstance(results, list) else [results] if results is not None else []
            else:
                return results[0] if isinstance(results, list) and results else results

        except Exception as e:
            logger.warning(f"JSON extraction error: {e}")
            return [] if self.multiple else None

    def _extract_from_path(self, data: Any, path: List[str], index: int) -> Any:
        """Recursively extract data following the path."""
        if index >= len(path):
            return data

        key = path[index]

        if key == '*':
            # Wildcard - extract from all items
            if isinstance(data, list):
                results = []
                for item in data:
                    result = self._extract_from_path(item, path, index + 1)
                    if result is not None:
                        if isinstance(result, list):
                            results.extend(result)
                        else:
                            results.append(result)
                return results
            elif isinstance(data, dict):
                results = []
                for value in data.values():
                    result = self._extract_from_path(value, path, index + 1)
                    if result is not None:
                        if isinstance(result, list):
                            results.extend(result)
                        else:
                            results.append(result)
                return results
            else:
                return None
        else:
            # Specific key
            if isinstance(data, dict) and key in data:
                return self._extract_from_path(data[key], path, index + 1)
            elif isinstance(data, list) and key.isdigit():
                idx = int(key)
                if 0 <= idx < len(data):
                    return self._extract_from_path(data[idx], path, index + 1)

            return None
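

# Illustrative sketch (not part of the original source): how the dotted path with a
# "*" wildcard walks nested JSON. extract() ignores its context argument, so None is
# passed here only to satisfy the signature. The payload is an invented example.
#
#   payload = '{"data": {"items": [{"name": "alpha"}, {"name": "beta"}]}}'
#   JSONExtractionRule("names", "data.items.*.name", multiple=True).extract(payload, None)
#   # -> ["alpha", "beta"]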


class LLMExtractionRule(ExtractionRule):
    """
    Rule to extract information using LLM with natural language prompts.
    自然言語プロンプトを使ってLLMで情報を抽出するルール。
    """

    def __init__(self, name: str, prompt: str, llm_pipeline: LLMPipeline = None,
                 output_format: str = "text", multiple: bool = False):
        """
        Initialize LLM extraction rule.
        LLM抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            prompt: Extraction prompt / 抽出プロンプト
            llm_pipeline: LLM pipeline to use / 使用するLLMパイプライン
            output_format: Expected output format ("text", "json", "list") / 期待する出力形式
            multiple: Whether to extract multiple items / 複数のアイテムを抽出するかどうか
        """
        super().__init__(name, f"Extract using LLM with prompt: {prompt[:50]}...")
        self.prompt = prompt
        self.llm_pipeline = llm_pipeline
        self.output_format = output_format.lower()
        self.multiple = multiple

    def extract(self, data: str, context: Context) -> Union[Any, List[Any], None]:
        """Extract using LLM pipeline."""
        if not self.llm_pipeline:
            logger.warning(f"No LLM pipeline provided for {self.name}")
            return [] if self.multiple else None

        try:
            # Create extraction prompt
            full_prompt = f"{self.prompt}\n\nInput data:\n{data}\n\nExtracted information:"

            # Use LLM to extract
            result = self.llm_pipeline.generate(full_prompt)

            if not result.success or not result.content:
                return [] if self.multiple else None

            extracted_text = result.content.strip()

            # Parse output based on format
            if self.output_format == "json":
                try:
                    parsed = json.loads(extracted_text)
                    return parsed
                except json.JSONDecodeError:
                    logger.warning(f"Failed to parse JSON output: {extracted_text}")
                    return [] if self.multiple else None

            elif self.output_format == "list":
                # Split by lines and clean up
                lines = [line.strip() for line in extracted_text.split('\n') if line.strip()]
                return lines

            else:  # text format
                return extracted_text if extracted_text else ([] if self.multiple else None)

        except Exception as e:
            logger.error(f"LLM extraction error in {self.name}: {e}")
            return [] if self.multiple else None
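

# Illustrative sketch (not part of the original source): an LLM rule is usually built
# from a configuration entry rather than constructed directly; the keys below mirror
# what ExtractorAgent._build_extraction_rules() reads for the "llm" rule type. The rule
# name and prompt text are invented for the example.
#
#   {
#       "type": "llm",
#       "name": "action_items",
#       "prompt": "List the action items mentioned in the text.",
#       "output_format": "list",
#       "multiple": True,
#   }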


class CustomFunctionExtractionRule(ExtractionRule):
    """
    Rule using a custom extraction function.
    カスタム抽出関数を使用するルール。
    """

    def __init__(self, name: str, extraction_func: Callable[[str, Context], Any]):
        """
        Initialize custom function extraction rule.
        カスタム関数抽出ルールを初期化します。

        Args:
            name: Rule name / ルール名
            extraction_func: Custom extraction function / カスタム抽出関数
        """
        super().__init__(name, "Custom extraction function")
        self.extraction_func = extraction_func

    def extract(self, data: str, context: Context) -> Any:
        """Extract using custom function."""
        try:
            return self.extraction_func(data, context)
        except Exception as e:
            logger.warning(f"Custom extraction function error: {e}")
            return None


class ExtractionResult:
    """
    Result of extraction operation.
    抽出操作の結果。
    """

    def __init__(self, extracted_data: Dict[str, Any], success: bool = True,
                 errors: List[str] = None, warnings: List[str] = None):
        """
        Initialize extraction result.
        抽出結果を初期化します。

        Args:
            extracted_data: Extracted data by rule name / ルール名別の抽出データ
            success: Whether extraction was successful / 抽出が成功したかどうか
            errors: List of error messages / エラーメッセージのリスト
            warnings: List of warning messages / 警告メッセージのリスト
        """
        self.extracted_data = extracted_data
        self.success = success
        self.errors = errors or []
        self.warnings = warnings or []
        self.timestamp = datetime.now()

    def add_error(self, error: str):
        """Add an error message."""
        self.errors.append(error)
        self.success = False

    def add_warning(self, warning: str):
        """Add a warning message."""
        self.warnings.append(warning)

    def get_extracted(self, rule_name: str) -> Any:
        """Get extracted data for a specific rule."""
        return self.extracted_data.get(rule_name)

    def __str__(self) -> str:
        status = "SUCCESS" if self.success else "FAILED"
        return f"ExtractionResult({status}, {len(self.extracted_data)} rules, {len(self.errors)} errors)"


class ExtractorConfig(BaseModel):
    """
    Configuration for ExtractorAgent.
    ExtractorAgentの設定。
    """

    name: str = Field(description="Name of the extractor agent / エクストラクターエージェントの名前")

    rules: List[Dict[str, Any]] = Field(
        default=[],
        description="List of extraction rules configuration / 抽出ルール設定のリスト"
    )

    input_format: str = Field(
        default="auto",
        description="Expected input format (auto, text, html, json) / 期待する入力形式"
    )

    store_result: bool = Field(
        default=True,
        description="Store extraction result in context / 抽出結果をコンテキストに保存"
    )

    fail_on_error: bool = Field(
        default=False,
        description="Fail if any extraction rule fails / いずれかの抽出ルールが失敗した場合に失敗する"
    )

    @field_validator("input_format")
    @classmethod
    def validate_input_format(cls, v):
        """Validate input format."""
        allowed_formats = ["auto", "text", "html", "json"]
        if v not in allowed_formats:
            raise ValueError(f"input_format must be one of {allowed_formats}")
        return v
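

# Illustrative sketch (not part of the original source): a minimal configuration.
# Each entry in "rules" is a plain dict whose keys are interpreted by
# ExtractorAgent._build_extraction_rules() below. The agent name, rule names, and
# the postcode pattern are invented for the example.
#
#   config = ExtractorConfig(
#       name="signup_form_extractor",
#       rules=[
#           {"type": "email", "name": "emails"},
#           {"type": "regex", "name": "postcodes", "pattern": r"\b\d{3}-\d{4}\b", "multiple": True},
#       ],
#       input_format="text",
#       fail_on_error=False,
#   )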


class ExtractorAgent(Step):
    """
    Extractor agent for information extraction from unstructured data.
    非構造化データから情報抽出を行うエクストラクターエージェント。

    The ExtractorAgent processes input data and extracts specific information
    using configured extraction rules (regex, HTML, JSON, LLM-based).
    ExtractorAgentは入力データを処理し、設定された抽出ルール
    (正規表現、HTML、JSON、LLMベース)を使って特定の情報を抽出します。
    """

    def __init__(self, config: ExtractorConfig, custom_rules: List[ExtractionRule] = None,
                 llm_pipeline: LLMPipeline = None):
        """
        Initialize ExtractorAgent.
        ExtractorAgentを初期化します。

        Args:
            config: Extractor configuration / エクストラクター設定
            custom_rules: Optional custom extraction rules / オプションのカスタム抽出ルール
            llm_pipeline: LLM pipeline for LLM-based extraction / LLMベース抽出用のLLMパイプライン
        """
        super().__init__(name=config.name)
        self.config = config
        self.llm_pipeline = llm_pipeline
        self.extraction_rules = self._build_extraction_rules(custom_rules or [])

    def _build_extraction_rules(self, custom_rules: List[ExtractionRule]) -> List[ExtractionRule]:
        """
        Build extraction rules from configuration and custom rules.
        設定とカスタムルールから抽出ルールを構築します。
        """
        rules = list(custom_rules)

        # Build rules from configuration
        # 設定からルールを構築
        for rule_config in self.config.rules:
            rule_type = rule_config.get("type")
            rule_name = rule_config.get("name", rule_type)

            if rule_type == "regex":
                pattern = rule_config.get("pattern")
                group = rule_config.get("group", 0)
                multiple = rule_config.get("multiple", False)
                if pattern:
                    rules.append(RegexExtractionRule(rule_name, pattern, group, multiple))

            elif rule_type == "email":
                multiple = rule_config.get("multiple", True)
                rules.append(EmailExtractionRule(rule_name, multiple))

            elif rule_type == "phone":
                multiple = rule_config.get("multiple", True)
                rules.append(PhoneExtractionRule(rule_name, multiple))

            elif rule_type == "url":
                multiple = rule_config.get("multiple", True)
                rules.append(URLExtractionRule(rule_name, multiple))

            elif rule_type == "date":
                multiple = rule_config.get("multiple", True)
                rules.append(DateExtractionRule(rule_name, multiple))

            elif rule_type == "html":
                tag = rule_config.get("tag")
                attribute = rule_config.get("attribute")
                multiple = rule_config.get("multiple", False)
                rules.append(HTMLExtractionRule(rule_name, tag, attribute, multiple))

            elif rule_type == "json":
                path = rule_config.get("path")
                multiple = rule_config.get("multiple", False)
                if path:
                    rules.append(JSONExtractionRule(rule_name, path, multiple))

            elif rule_type == "llm":
                prompt = rule_config.get("prompt")
                output_format = rule_config.get("output_format", "text")
                multiple = rule_config.get("multiple", False)
                if prompt:
                    rules.append(LLMExtractionRule(rule_name, prompt, self.llm_pipeline,
                                                   output_format, multiple))

            else:
                logger.warning(f"Unknown rule type: {rule_type}")

        return rules

    async def run(self, user_input: Optional[str], ctx: Context) -> Context:
        """
        Execute the extraction logic.
        抽出ロジックを実行します。

        Args:
            user_input: User input to extract from / 抽出対象のユーザー入力
            ctx: Execution context / 実行コンテキスト

        Returns:
            Context: Updated context with extraction results / 抽出結果を含む更新されたコンテキスト
        """
        # Update step info
        # ステップ情報を更新
        ctx.update_step_info(self.name)

        try:
            # Determine data to extract from
            # 抽出対象のデータを決定
            data_to_extract = user_input
            if data_to_extract is None:
                data_to_extract = ctx.get_user_input()

            if not data_to_extract:
                logger.warning(f"No input data provided for extraction in {self.name}")
                data_to_extract = ""

            # Perform extraction
            # 抽出を実行
            extraction_result = self._extract_data(data_to_extract, ctx)

            # Store result in context if requested
            # 要求された場合は結果をコンテキストに保存
            if self.config.store_result:
                ctx.shared_state[f"{self.name}_result"] = {
                    "extracted_data": extraction_result.extracted_data,
                    "success": extraction_result.success,
                    "errors": extraction_result.errors,
                    "warnings": extraction_result.warnings,
                    "timestamp": extraction_result.timestamp.isoformat()
                }

            # Handle extraction failure
            # 抽出失敗を処理
            if not extraction_result.success:
                error_summary = f"Extraction failed: {', '.join(extraction_result.errors)}"

                if self.config.fail_on_error:
                    raise ValueError(error_summary)

                logger.warning(f"ExtractorAgent '{self.name}': {error_summary}")
                ctx.shared_state[f"{self.name}_status"] = "failed"
            else:
                logger.info(f"ExtractorAgent '{self.name}': Extraction successful, "
                            f"{len(extraction_result.extracted_data)} rules processed")
                ctx.shared_state[f"{self.name}_status"] = "success"

            # Add warnings to context if any
            # 警告があればコンテキストに追加
            if extraction_result.warnings:
                ctx.shared_state[f"{self.name}_warnings"] = extraction_result.warnings

            # Store individual extracted data for easy access
            # 簡単なアクセスのために個別の抽出データを保存
            for rule_name, extracted_value in extraction_result.extracted_data.items():
                ctx.shared_state[f"{self.name}_{rule_name}"] = extracted_value

            return ctx

        except Exception as e:
            logger.error(f"ExtractorAgent '{self.name}' error: {e}")

            if self.config.store_result:
                ctx.shared_state[f"{self.name}_result"] = {
                    "extracted_data": {},
                    "success": False,
                    "errors": [str(e)],
                    "warnings": [],
                    "timestamp": datetime.now().isoformat()
                }
            ctx.shared_state[f"{self.name}_status"] = "error"

            if self.config.fail_on_error:
                raise

            return ctx

    def _extract_data(self, data: str, context: Context) -> ExtractionResult:
        """
        Extract data using all configured rules.
        設定された全てのルールを使ってデータを抽出します。
        """
        result = ExtractionResult({})

        for rule in self.extraction_rules:
            try:
                extracted_value = rule.extract(data, context)
                result.extracted_data[rule.name] = extracted_value

                if extracted_value is None or (isinstance(extracted_value, list) and not extracted_value):
                    result.add_warning(f"Rule '{rule.name}' extracted no data")

            except Exception as e:
                error_message = f"Rule '{rule.name}' execution error: {e}"
                result.add_error(error_message)
                logger.warning(error_message)

        return result

    def add_rule(self, rule: ExtractionRule):
        """
        Add an extraction rule to the agent.
        エージェントに抽出ルールを追加します。
        """
        self.extraction_rules.append(rule)

    def get_rules(self) -> List[ExtractionRule]:
        """
        Get all extraction rules.
        全ての抽出ルールを取得します。
        """
        return self.extraction_rules.copy()


# Utility functions for creating common extractors
# 一般的なエクストラクターを作成するためのユーティリティ関数

def create_contact_extractor(name: str = "contact_extractor") -> ExtractorAgent:
    """
    Create an extractor for contact information (emails, phones, URLs).
    連絡先情報(メール、電話、URL)用のエクストラクターを作成します。
    """
    config = ExtractorConfig(
        name=name,
        rules=[
            {"type": "email", "name": "emails"},
            {"type": "phone", "name": "phones"},
            {"type": "url", "name": "urls"}
        ]
    )
    return ExtractorAgent(config)


def create_html_extractor(name: str, tags: Dict[str, str]) -> ExtractorAgent:
    """
    Create an extractor for HTML content using tag names.
    タグ名を使ったHTMLコンテンツ用のエクストラクターを作成します。

    Args:
        name: Extractor name / エクストラクター名
        tags: Mapping of rule names to HTML tag names / ルール名からHTMLタグ名へのマッピング
    """
    rules = []
    for rule_name, tag in tags.items():
        rules.append({
            "type": "html",
            "name": rule_name,
            "tag": tag,
            "multiple": True
        })

    config = ExtractorConfig(name=name, rules=rules)
    return ExtractorAgent(config)


def create_json_extractor(name: str, paths: Dict[str, str]) -> ExtractorAgent:
    """
    Create an extractor for JSON data using paths.
    パスを使ったJSONデータ用のエクストラクターを作成します。

    Args:
        name: Extractor name / エクストラクター名
        paths: Mapping of rule names to JSON paths / ルール名からJSONパスへのマッピング
    """
    rules = []
    for rule_name, path in paths.items():
        rules.append({
            "type": "json",
            "name": rule_name,
            "path": path,
            "multiple": False
        })

    config = ExtractorConfig(name=name, rules=rules)
    return ExtractorAgent(config)


def create_llm_extractor(name: str, prompts: Dict[str, str],
                         llm_pipeline: LLMPipeline) -> ExtractorAgent:
    """
    Create an extractor using LLM with custom prompts.
    カスタムプロンプトを持つLLMを使ったエクストラクターを作成します。

    Args:
        name: Extractor name / エクストラクター名
        prompts: Mapping of rule names to extraction prompts / ルール名から抽出プロンプトへのマッピング
        llm_pipeline: LLM pipeline to use / 使用するLLMパイプライン
    """
    rules = []
    for rule_name, prompt in prompts.items():
        rules.append({
            "type": "llm",
            "name": rule_name,
            "prompt": prompt,
            "output_format": "text"
        })

    config = ExtractorConfig(name=name, rules=rules)
    return ExtractorAgent(config, llm_pipeline=llm_pipeline)
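

# Illustrative sketch (not part of the original source): one possible end-to-end use
# of create_contact_extractor inside an async flow. Context() is assumed here to be
# constructible without arguments, which may not hold in the real refinire API; in
# practice the context is supplied by the enclosing Flow. The input text is invented.
#
#   import asyncio
#
#   async def demo():
#       agent = create_contact_extractor("contacts")
#       ctx = Context()  # assumption: no-arg constructor
#       text = "Reach us at support@example.com or +1 555-123-4567, see https://example.com/help"
#       ctx = await agent.run(text, ctx)
#       print(ctx.shared_state["contacts_result"]["extracted_data"])
#
#   asyncio.run(demo())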