from __future__ import annotations from collections import defaultdict from pydantic import BaseModel, Field from sir import EdgeKind, NodeKind, SemanticNode, SirSnapshot class SemanticPattern(BaseModel): pattern_id: str kind: str title: str participants: list[SemanticNode] = Field(default_factory=list) targets: list[SemanticNode] = Field(default_factory=list) support: int attributes: dict = Field(default_factory=dict) def mine_patterns(snapshot: SirSnapshot, *, min_support: int = 2) -> list[SemanticPattern]: nodes = {node.lineage_id: node for node in snapshot.nodes} patterns: list[SemanticPattern] = [] patterns.extend(_table_usage_patterns(snapshot, nodes, min_support=min_support)) patterns.extend(_routine_read_write_patterns(snapshot, nodes, min_support=min_support)) return sorted(patterns, key=lambda item: (item.kind, -item.support, item.title)) def _table_usage_patterns( snapshot: SirSnapshot, nodes: dict[str, SemanticNode], *, min_support: int, ) -> list[SemanticPattern]: query_owner: dict[str, SemanticNode] = {} for edge in snapshot.edges: if edge.kind == EdgeKind.OWNS_QUERY and edge.source_lineage in nodes: query_owner[edge.target_lineage] = nodes[edge.source_lineage] readers_by_table: dict[str, dict[str, SemanticNode]] = defaultdict(dict) writers_by_table: dict[str, dict[str, SemanticNode]] = defaultdict(dict) for edge in snapshot.edges: if edge.kind == EdgeKind.READS_TABLE: table = nodes.get(edge.target_lineage) owner = query_owner.get(edge.source_lineage) if table is not None and owner is not None: readers_by_table[table.lineage_id][owner.lineage_id] = owner elif edge.kind == EdgeKind.WRITES: table = nodes.get(edge.target_lineage) writer = nodes.get(edge.source_lineage) if table is not None and writer is not None: writers_by_table[table.lineage_id][writer.lineage_id] = writer patterns: list[SemanticPattern] = [] for table_lineage, readers in readers_by_table.items(): table = nodes[table_lineage] if len(readers) >= min_support: patterns.append( SemanticPattern( pattern_id=f"pattern.repeated_read.{table.lineage_id}", kind="REPEATED_TABLE_READ", title=f"Repeated reads of {table.qualified_name}", participants=sorted(readers.values(), key=lambda node: node.qualified_name), targets=[table], support=len(readers), ) ) for table_lineage, writers in writers_by_table.items(): table = nodes[table_lineage] if len(writers) >= min_support: patterns.append( SemanticPattern( pattern_id=f"pattern.repeated_write.{table.lineage_id}", kind="REPEATED_TABLE_WRITE", title=f"Repeated writes to {table.qualified_name}", participants=sorted(writers.values(), key=lambda node: node.qualified_name), targets=[table], support=len(writers), ) ) return patterns def _routine_read_write_patterns( snapshot: SirSnapshot, nodes: dict[str, SemanticNode], *, min_support: int, ) -> list[SemanticPattern]: query_owner: dict[str, str] = {} for edge in snapshot.edges: if edge.kind == EdgeKind.OWNS_QUERY: query_owner[edge.target_lineage] = edge.source_lineage reads_by_routine: dict[str, set[str]] = defaultdict(set) writes_by_routine: dict[str, set[str]] = defaultdict(set) for edge in snapshot.edges: if edge.kind == EdgeKind.READS_TABLE and edge.source_lineage in query_owner: reads_by_routine[query_owner[edge.source_lineage]].add(edge.target_lineage) elif edge.kind == EdgeKind.WRITES: writes_by_routine[edge.source_lineage].add(edge.target_lineage) grouped: dict[tuple[tuple[str, ...], tuple[str, ...]], list[SemanticNode]] = defaultdict(list) routine_kinds = {NodeKind.PROCEDURE, NodeKind.FUNCTION} for routine_lineage in sorted(set(reads_by_routine) | set(writes_by_routine)): routine = nodes.get(routine_lineage) if routine is None or routine.kind not in routine_kinds: continue key = (tuple(sorted(reads_by_routine[routine_lineage])), tuple(sorted(writes_by_routine[routine_lineage]))) if key == ((), ()): continue grouped[key].append(routine) patterns: list[SemanticPattern] = [] for index, ((reads, writes), routines) in enumerate(sorted(grouped.items()), start=1): if len(routines) < min_support: continue targets = [nodes[lineage] for lineage in [*reads, *writes] if lineage in nodes] patterns.append( SemanticPattern( pattern_id=f"pattern.routine_io.{index}", kind="REPEATED_ROUTINE_IO", title="Repeated routine read/write shape", participants=sorted(routines, key=lambda node: node.qualified_name), targets=sorted(targets, key=lambda node: node.qualified_name), support=len(routines), attributes={"read_count": len(reads), "write_count": len(writes)}, ) ) return patterns __all__ = ["SemanticPattern", "mine_patterns"]