Initial SFERA platform baseline

2026-05-16 19:03:49 +03:00
commit 3b845c8fce
282 changed files with 55045 additions and 0 deletions
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+from collections import defaultdict
+
+from pydantic import BaseModel, Field
+
+from sir import EdgeKind, NodeKind, SemanticNode, SirSnapshot
+
+
+class SemanticPattern(BaseModel):
+    """A recurring structure detected in a SIR snapshot.
+
+    Each pattern ties together the nodes that exhibit it (``participants``)
+    and the nodes they act on (``targets``), plus how many participants
+    support it.
+    """
+
+    # Identifier assigned by the miner, e.g. "pattern.repeated_read.<table lineage id>".
+    pattern_id: str
+    # Pattern category; the miners in this module emit "REPEATED_TABLE_READ",
+    # "REPEATED_TABLE_WRITE" and "REPEATED_ROUTINE_IO".
+    kind: str
+    # Human-readable summary of the pattern.
+    title: str
+    # Nodes that exhibit the pattern (readers, writers or routines).
+    participants: list[SemanticNode] = Field(default_factory=list)
+    # Nodes the participants act on (the tables involved).
+    targets: list[SemanticNode] = Field(default_factory=list)
+    # Number of participants backing the pattern.
+    support: int
+    # Free-form extra data, e.g. {"read_count": ..., "write_count": ...}.
+    attributes: dict = Field(default_factory=dict)
+
+
+def mine_patterns(snapshot: SirSnapshot, *, min_support: int = 2) -> list[SemanticPattern]:
+    nodes = {node.lineage_id: node for node in snapshot.nodes}
+    patterns: list[SemanticPattern] = []
+    patterns.extend(_table_usage_patterns(snapshot, nodes, min_support=min_support))
+    patterns.extend(_routine_read_write_patterns(snapshot, nodes, min_support=min_support))
+    return sorted(patterns, key=lambda item: (item.kind, -item.support, item.title))
+
+
+def _table_usage_patterns(
+    snapshot: SirSnapshot,
+    nodes: dict[str, SemanticNode],
+    *,
+    min_support: int,
+) -> list[SemanticPattern]:
+    query_owner: dict[str, SemanticNode] = {}
+    for edge in snapshot.edges:
+        if edge.kind == EdgeKind.OWNS_QUERY and edge.source_lineage in nodes:
+            query_owner[edge.target_lineage] = nodes[edge.source_lineage]
+
+    readers_by_table: dict[str, dict[str, SemanticNode]] = defaultdict(dict)
+    writers_by_table: dict[str, dict[str, SemanticNode]] = defaultdict(dict)
+    for edge in snapshot.edges:
+        if edge.kind == EdgeKind.READS_TABLE:
+            table = nodes.get(edge.target_lineage)
+            owner = query_owner.get(edge.source_lineage)
+            if table is not None and owner is not None:
+                readers_by_table[table.lineage_id][owner.lineage_id] = owner
+        elif edge.kind == EdgeKind.WRITES:
+            table = nodes.get(edge.target_lineage)
+            writer = nodes.get(edge.source_lineage)
+            if table is not None and writer is not None:
+                writers_by_table[table.lineage_id][writer.lineage_id] = writer
+
+    patterns: list[SemanticPattern] = []
+    for table_lineage, readers in readers_by_table.items():
+        table = nodes[table_lineage]
+        if len(readers) >= min_support:
+            patterns.append(
+                SemanticPattern(
+                    pattern_id=f"pattern.repeated_read.{table.lineage_id}",
+                    kind="REPEATED_TABLE_READ",
+                    title=f"Repeated reads of {table.qualified_name}",
+                    participants=sorted(readers.values(), key=lambda node: node.qualified_name),
+                    targets=[table],
+                    support=len(readers),
+                )
+            )
+    for table_lineage, writers in writers_by_table.items():
+        table = nodes[table_lineage]
+        if len(writers) >= min_support:
+            patterns.append(
+                SemanticPattern(
+                    pattern_id=f"pattern.repeated_write.{table.lineage_id}",
+                    kind="REPEATED_TABLE_WRITE",
+                    title=f"Repeated writes to {table.qualified_name}",
+                    participants=sorted(writers.values(), key=lambda node: node.qualified_name),
+                    targets=[table],
+                    support=len(writers),
+                )
+            )
+    return patterns
+
+
+def _routine_read_write_patterns(
+    snapshot: SirSnapshot,
+    nodes: dict[str, SemanticNode],
+    *,
+    min_support: int,
+) -> list[SemanticPattern]:
+    query_owner: dict[str, str] = {}
+    for edge in snapshot.edges:
+        if edge.kind == EdgeKind.OWNS_QUERY:
+            query_owner[edge.target_lineage] = edge.source_lineage
+
+    reads_by_routine: dict[str, set[str]] = defaultdict(set)
+    writes_by_routine: dict[str, set[str]] = defaultdict(set)
+    for edge in snapshot.edges:
+        if edge.kind == EdgeKind.READS_TABLE and edge.source_lineage in query_owner:
+            reads_by_routine[query_owner[edge.source_lineage]].add(edge.target_lineage)
+        elif edge.kind == EdgeKind.WRITES:
+            writes_by_routine[edge.source_lineage].add(edge.target_lineage)
+
+    grouped: dict[tuple[tuple[str, ...], tuple[str, ...]], list[SemanticNode]] = defaultdict(list)
+    routine_kinds = {NodeKind.PROCEDURE, NodeKind.FUNCTION}
+    for routine_lineage in sorted(set(reads_by_routine) | set(writes_by_routine)):
+        routine = nodes.get(routine_lineage)
+        if routine is None or routine.kind not in routine_kinds:
+            continue
+        key = (tuple(sorted(reads_by_routine[routine_lineage])), tuple(sorted(writes_by_routine[routine_lineage])))
+        if key == ((), ()):
+            continue
+        grouped[key].append(routine)
+
+    patterns: list[SemanticPattern] = []
+    for index, ((reads, writes), routines) in enumerate(sorted(grouped.items()), start=1):
+        if len(routines) < min_support:
+            continue
+        targets = [nodes[lineage] for lineage in [*reads, *writes] if lineage in nodes]
+        patterns.append(
+            SemanticPattern(
+                pattern_id=f"pattern.routine_io.{index}",
+                kind="REPEATED_ROUTINE_IO",
+                title="Repeated routine read/write shape",
+                participants=sorted(routines, key=lambda node: node.qualified_name),
+                targets=sorted(targets, key=lambda node: node.qualified_name),
+                support=len(routines),
+                attributes={"read_count": len(reads), "write_count": len(writes)},
+            )
+        )
+    return patterns
+
+
+# Public API of this module; the underscore-prefixed miners are internal.
+__all__ = ["SemanticPattern", "mine_patterns"]