# (extraction metadata removed: 208 lines, 6.5 KiB, Python)
"""Recursive semantic decomposition: split text into atomic units."""
from __future__ import annotations

import re
import uuid
from typing import Any

from fusionagi.reasoning.native import analyze_prompt
from fusionagi.schemas.atomic import (
    AtomicSemanticUnit,
    AtomicUnitType,
    DecompositionResult,
    RelationType,
    SemanticRelation,
)
from fusionagi._logger import logger


def _make_unit_id(prefix: str = "asu") -> str:
|
|
"""Generate unique unit ID."""
|
|
return f"{prefix}_{uuid.uuid4().hex[:12]}"
|
|
|
|
|
|
def _is_atomic(text: str, min_words: int = 3) -> bool:
|
|
"""Check if text is irreducible (atomic)."""
|
|
content = " ".join(text.split()).strip()
|
|
if not content or len(content) < 10:
|
|
return True
|
|
words = len(content.split())
|
|
return words <= min_words
|
|
|
|
|
|
def _extract_questions(text: str) -> list[str]:
|
|
"""Extract explicit questions from text."""
|
|
questions: list[str] = []
|
|
content = " ".join(text.split()).strip()
|
|
q_parts = re.split(r"\?+", content)
|
|
for part in q_parts[:-1]:
|
|
q = part.strip()
|
|
if len(q) > 10:
|
|
questions.append(q + "?")
|
|
if not questions and any(w in content.lower() for w in ["how", "what", "why", "when", "where", "who"]):
|
|
questions.append(content)
|
|
return questions[:5]
|
|
|
|
|
|
def _extract_constraints(text: str) -> list[str]:
|
|
"""Extract constraint signals from text."""
|
|
constraints: list[str] = []
|
|
patterns = [
|
|
r"must\s+(\w[\w\s]+?)(?:\.|$)",
|
|
r"should\s+(\w[\w\s]+?)(?:\.|$)",
|
|
r"cannot\s+(\w[\w\s]+?)(?:\.|$)",
|
|
r"require[sd]?\s+(\w[\w\s]+?)(?:\.|$)",
|
|
r"constraint[s]?:\s*(\w[\w\s]+?)(?:\.|$)",
|
|
r"assume[sd]?\s+(\w[\w\s]+?)(?:\.|$)",
|
|
]
|
|
for pat in patterns:
|
|
for m in re.finditer(pat, text, re.I):
|
|
constraints.append(m.group(1).strip())
|
|
return list(dict.fromkeys(constraints))[:10]
|
|
|
|
|
|
def _extract_entities(text: str) -> list[str]:
|
|
"""Extract entity-like phrases."""
|
|
entities = re.findall(r'"([^"]+)"', text)
|
|
entities += re.findall(r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b", text)
|
|
return list(dict.fromkeys(e for e in entities if len(e) > 2))[:10]
|
|
|
|
|
|
def _child_unit(
    content: str,
    unit_type: AtomicUnitType,
    confidence: float,
    parent_id: str,
    source_ref: str | None,
    units: list[AtomicSemanticUnit],
    relations: list[SemanticRelation],
) -> None:
    """Append one child unit plus its LOGICAL link to *units*/*relations*."""
    unit_id = _make_unit_id()
    units.append(
        AtomicSemanticUnit(
            unit_id=unit_id,
            content=content,
            type=unit_type,
            confidence=confidence,
            parent_id=parent_id,
            source_ref=source_ref,
        )
    )
    relations.append(
        SemanticRelation(from_id=parent_id, to_id=unit_id, relation_type=RelationType.LOGICAL)
    )


def decompose_recursive(
    text: str,
    max_depth: int = 3,
    parent_id: str | None = None,
    current_depth: int = 0,
    source_ref: str | None = None,
) -> DecompositionResult:
    """
    Recursively decompose text into atomic semantic units.

    Extracts entities, constraints, intents, assumptions, questions; recurses
    on non-atomic segments. Integrates with native analyze_prompt for intent
    and domain signals.

    Args:
        text: Input text to decompose.
        max_depth: Maximum recursion depth.
        parent_id: Parent unit ID for decomposition tree.
        current_depth: Current recursion depth.
        source_ref: Optional source reference.

    Returns:
        DecompositionResult with units and relations.
    """
    content = " ".join(text.split()).strip()
    if not content:
        return DecompositionResult(units=[], relations=[], depth=current_depth)

    units: list[AtomicSemanticUnit] = []
    relations: list[SemanticRelation] = []

    analysis = analyze_prompt(content)

    # Root unit for this segment; stored content is capped at 500 chars.
    root_id = _make_unit_id()
    units.append(
        AtomicSemanticUnit(
            unit_id=root_id,
            content=content[:500] + ("..." if len(content) > 500 else ""),
            type=AtomicUnitType.INTENT if analysis.intent == "question" else AtomicUnitType.FACT,
            confidence=0.8,
            parent_id=parent_id,
            source_ref=source_ref,
            metadata={"intent": analysis.intent},
        )
    )
    if parent_id:
        relations.append(
            SemanticRelation(from_id=parent_id, to_id=root_id, relation_type=RelationType.LOGICAL)
        )

    # Child units (questions, constraints, entities), each linked to root.
    # Confidences mirror the original per-kind values.
    for q in _extract_questions(content):
        _child_unit(q, AtomicUnitType.QUESTION, 0.9, root_id, source_ref, units, relations)
    for c in _extract_constraints(content):
        _child_unit(c, AtomicUnitType.CONSTRAINT, 0.85, root_id, source_ref, units, relations)
    for e in _extract_entities(content):
        _child_unit(e, AtomicUnitType.FACT, 0.9, root_id, source_ref, units, relations)

    # Recurse on sentence splits while the segment is still divisible.
    if not _is_atomic(content, min_words=8) and current_depth < max_depth:
        sentences = re.split(r"[.!?]\s+", content)
        if len(sentences) > 1:
            for sent in sentences:
                sent = sent.strip()
                if len(sent) > 20:  # skip fragments too short to decompose
                    sub = decompose_recursive(
                        sent,
                        max_depth=max_depth,
                        parent_id=root_id,
                        current_depth=current_depth + 1,
                        source_ref=source_ref,
                    )
                    units.extend(sub.units)
                    relations.extend(sub.relations)

    # Dedupe by unit_id while preserving insertion order.
    seen: set[str] = set()
    unique_units: list[AtomicSemanticUnit] = []
    for u in units:
        if u.unit_id not in seen:
            seen.add(u.unit_id)
            unique_units.append(u)

    logger.debug(
        "Decomposition complete",
        extra={"depth": current_depth, "units": len(unique_units), "relations": len(relations)},
    )

    return DecompositionResult(
        units=unique_units,
        relations=relations,
        depth=current_depth,
    )