Source code for semnet.semnet

import logging
from typing import Dict, List, Literal, Optional, Tuple

import networkx as nx
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from tqdm.auto import tqdm

logger = logging.getLogger(__name__)

MetricType = Literal["angular", "euclidean", "manhattan", "hamming", "dot"]


[docs] def to_pandas(graph: nx.Graph) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Export a NetworkX graph to pandas DataFrames. This function operates on any NetworkX graph, making it useful for analyzing graphs from SemanticNetwork or any other NetworkX graph. Args: graph: NetworkX graph to export Returns: Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing: - nodes (pd.DataFrame): Node attributes with index as node ID. Columns include all node attributes from the graph. - edges (pd.DataFrame): Edge list with columns 'source', 'target', and any edge attributes (e.g., 'weight'). Examples: >>> # Export any NetworkX graph >>> import networkx as nx >>> from semnet import to_pandas >>> >>> # Create or load any graph >>> G = nx.karate_club_graph() >>> nodes, edges = to_pandas(G) >>> # Use with SemanticNetwork >>> network = SemanticNetwork(thresh=0.8) >>> graph = network.fit_transform(embeddings, labels=docs) >>> nodes, edges = to_pandas(graph) >>> # Export a subgraph >>> subgraph = graph.subgraph([0, 1, 2]) >>> sub_nodes, sub_edges = to_pandas(subgraph) """ # Convert nodes to DataFrame node_list = [] for node, data in graph.nodes(data=True): node_data = {"node_id": node} node_data.update(data) node_list.append(node_data) nodes = pd.DataFrame(node_list) # Convert edges to DataFrame if graph.number_of_edges() > 0: edges = nx.to_pandas_edgelist(graph) else: # Create empty DataFrame with expected columns if no edges edges = pd.DataFrame(columns=["source", "target"]) return nodes, edges
[docs] class SemanticNetwork: """ A semantic network builder for creating graphs from document embeddings. This class follows the scikit-learn pattern with fit() and transform() methods. Users must provide pre-computed embeddings during the fit process. The fitting process builds an approximate nearest neighbor index from embeddings. The transformation process constructs a graph where edges represent semantic similarity. Key Methods: fit(): Build the similarity index from provided embeddings transform(): Construct and return a networkx object fit_transform(): Combined fit and transform in one step to_pandas(): Export graph structure to pandas DataFrames for analysis Attributes: metric: Distance metric for the Annoy index n_trees: Number of trees for the Annoy index thresh: Similarity threshold for connecting documents top_k: Maximum neighbors to check per document verbose: Whether to show progress bars and detailed logging is_fitted_: Whether the model has been fitted embeddings_: Document embeddings array (available after fitting) index_: Annoy index for similarity search (available after fitting) """ def __init__( self, metric: MetricType = "angular", n_trees: int = 10, thresh: float = 0.3, top_k: int = 20, search_k: Optional[int] = None, verbose: bool = False, ) -> None: """ Initialize the SemanticNetwork. Args: metric: Distance metric for the Annoy index. Options: - 'angular': Cosine distance (1 - cosine similarity). Best for normalized embeddings like those from sentence transformers. Default and recommended. - 'euclidean': L2 distance. Good for embeddings where magnitude matters. - 'manhattan': L1 distance. Less common, useful for sparse data. - 'hamming': Hamming distance for binary vectors. - 'dot': Negative dot product. Use with caution. n_trees: Number of trees in the Annoy index forest. More trees = higher accuracy but slower build time and larger memory usage. Typical range: 10-100. - 10 (default): Good balance for most applications - 50+: Better accuracy for large datasets (100k+ documents) - 100+: Highest accuracy for production systems thresh: Similarity threshold for edge creation (0.0 to 1.0). Documents with similarity >= thresh will be connected by an edge. Higher values create sparser, more selective networks: - 0.1-0.2: Dense networks, captures weak similarities - 0.2-0.4: Moderate selectivity, good for most applications - 0.5+: Very selective, only strong similarities top_k: Maximum number of nearest neighbors to examine per document when building the network. Higher values find more potential connections but increase computation time. Should be >> expected degree: - 10-20: Good starting point for most applications (default: 20) - 20-50: More comprehensive search for medium datasets - 50-100: Thorough search for large or sparse datasets - 100+: Use only when seeking maximum connectivity search_k: Optional parameter for Annoy index search_k, controlling the number of nodes to inspect during search. If None, uses the default from Annoy. verbose: Whether to show progress bars and detailed logging during fit/transform """ self.metric = metric self.n_trees = n_trees self.thresh = thresh self.top_k = top_k self.search_k = search_k self.verbose = verbose # Fitted state self.is_fitted_ = False self.embeddings_: Optional[np.ndarray] = None self.index_: Optional[AnnoyIndex] = None # Training data (stored during fit) self._labels: Optional[List[str]] = None self._node_data: Optional[Dict] = None
[docs] def fit( self, embeddings: np.ndarray, ) -> "SemanticNetwork": """ Build the index from document embeddings. This method uses provided embeddings to create an Annoy index for fast nearest neighbor search. Args: embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim). labels: Optional list of text labels/documents for the embeddings. If not provided, will use string indices as labels. node_data: Optional dictionary containing additional data to attach to nodes. Format: {node_index: {attribute_name: value, ...}, ...} OR {node_index: single_value, ...} (will be stored as {'value': single_value}) Only nodes present in the dictionary will get additional attributes. Returns: self: Returns the fitted estimator Raises: ValueError: If labels provided but length doesn't match embeddings ValueError: If ids provided but length doesn't match embeddings ValueError: If node_data values don't match embeddings length """ self.embeddings_ = embeddings if self.verbose: logger.info(f"Using provided embeddings with shape: {self.embeddings_.shape}") logger.info(f"Fitting SemanticNetwork on {len(embeddings)} documents") # Build the vector index self._build_vector_index() self.is_fitted_ = True if self.verbose: logger.info("Fitting complete") return self
[docs] def transform( self, thresh: Optional[float] = None, top_k: Optional[int] = None, search_k: Optional[int] = None, labels: Optional[List[str]] = None, node_data: Optional[Dict] = None, ) -> nx.Graph: """ Build and return a weighted graph from the fitted embeddings. Args: thresh: The similarity threshold for edge inclusion. If None, uses the threshold from initialization. top_k: Optional max neighbors override for this transform. If None, uses the top_k from initialization. search_k: Optional parameter for Annoy index search_k, controlling the number of nodes to inspect during search. If None, uses the search_k from initialization. labels: Optional list of text labels/documents for the embeddings. If not provided, will use string indices as labels. node_data: Optional dictionary containing additional data to attach to nodes. Format: {node_index: {attribute_name: value, ...}, ...} OR {node_index: single_value, ...} (will be stored as {'value': single_value}) Only nodes present in the dictionary will get additional attributes. Returns: NetworkX graph where nodes represent documents and edges represent similarities above the threshold. Raises: ValueError: If the model hasn't been fitted yet """ if not self.is_fitted_: raise ValueError("This SemanticNetwork instance is not fitted yet. Call 'fit' first.") if self.embeddings_ is None: raise ValueError("Embeddings not available. This should not happen after fitting.") n_docs = self.embeddings_.shape[0] if labels is not None and len(labels) != n_docs: raise ValueError( f"Labels length ({len(labels)}) must match embeddings length ({n_docs})" ) if node_data is not None: # Validate node_data format: should be {node_index: {attribute_dict}} or {node_index: value} if not isinstance(node_data, dict): raise ValueError("Node data must be a dictionary") # Check if all keys are integers (node indices) non_integer_keys = [k for k in node_data.keys() if not isinstance(k, (int, np.integer))] if non_integer_keys: raise ValueError( f"Node data keys must be integer node indices, got: {non_integer_keys}" ) # Validate that node_data keys are valid node indices invalid_indices = [idx for idx in node_data.keys() if idx >= n_docs or idx < 0] if invalid_indices: raise ValueError( f"Node data contains invalid indices {invalid_indices}. Indices must be 0 <= idx < {n_docs}" ) # Convert single values to dictionary format for consistency # If values are not dictionaries, wrap them in a dictionary with 'value' key processed_node_data = {} for k, v in node_data.items(): if isinstance(v, dict): processed_node_data[k] = v else: processed_node_data[k] = {"value": v} node_data = processed_node_data # Store training data self._labels = labels if labels is not None else [str(i) for i in range(n_docs)] self._node_data = node_data # Use provided thresholds or fall back to instance defaults effective_thresh = thresh if thresh is not None else self.thresh effective_top_k = top_k if top_k is not None else self.top_k effective_search_k = search_k if search_k is not None else self.search_k # Get pairwise similarities neighbor_data = self._get_pairwise_similarities( effective_thresh, effective_top_k, effective_search_k ) # Build and return the graph return self._build_graph(neighbor_data)
[docs] def fit_transform( self, embeddings: np.ndarray, labels: Optional[List[str]] = None, node_data: Optional[Dict] = None, thresh: Optional[float] = None, top_k: Optional[int] = None, ) -> nx.Graph: """ Fit the model and transform the embeddings in one step. Args: embeddings: Pre-computed embeddings array with shape (n_docs, embedding_dim). labels: Optional list of text labels/documents for the embeddings. node_data: Optional dictionary containing additional data to attach to nodes. thresh: Optional similarity threshold override for this transform. top_k: Optional max neighbors override for this transform. Returns: NetworkX graph representing the semantic network """ return self.fit(embeddings=embeddings).transform( thresh=thresh, top_k=top_k, search_k=None, labels=labels, node_data=node_data )
def _build_vector_index(self) -> AnnoyIndex: """ Build an Annoy index for fast approximate nearest neighbor search. Returns: The built Annoy index Raises: ValueError: If embeddings haven't been provided yet Note: The index is stored in self.index_ and also returned. """ if self.embeddings_ is None: raise ValueError("Embeddings not found. Please provide embeddings in fit() method.") embedding_dim = self.embeddings_.shape[1] self.index_ = AnnoyIndex(embedding_dim, self.metric) # type: ignore if self.verbose: logger.info( f"Building Annoy index with {self.n_trees} trees for {len(self.embeddings_)} embeddings" ) iterator = tqdm( enumerate(self.embeddings_), total=len(self.embeddings_), desc="Adding embeddings to index", ) else: iterator = enumerate(self.embeddings_) for i, embedding_vector in iterator: self.index_.add_item(i, embedding_vector) if self.verbose: logger.info("Building index trees...") self.index_.build(self.n_trees) if self.verbose: logger.info("Vector index built successfully") return self.index_ def _get_pairwise_similarities( self, thresh: float, top_k: int, search_k: Optional[int] = None ) -> pd.DataFrame: """ Find pairwise similarities between documents above a threshold. Uses the Annoy index to efficiently find nearest neighbors for each document, then calculates exact similarities and filters by threshold. Args: thresh: Similarity threshold for including edges top_k: Maximum number of neighbors to check per document search_k: Optional parameter for Annoy index search_k, controlling the number of nodes to inspect during search Returns: DataFrame of similarities with columns: source_idx, target_idx, weight, source_name, target_name Raises: ValueError: If embeddings or index haven't been built yet """ if self.embeddings_ is None or self.index_ is None: raise ValueError( "Embeddings or index not found. Please provide embeddings in fit() method and run _build_vector_index() first." ) if self._labels is None: raise ValueError("No training documents found. Call fit() first.") if self.verbose: logger.info( f"Finding pairwise similarities with threshold {thresh}, checking top {top_k} neighbors" ) results = [] if self.verbose: iterator = tqdm(range(len(self.embeddings_)), desc="Finding similarities") else: iterator = range(len(self.embeddings_)) for idx_source in iterator: # Dealing with self loops and too large top_k # We add one to top_k to account for self-matches, so top_k = 1 will return 1 neighbor + self effective_top_k = top_k + 1 # In case top_k exceeds number of items, set to -1 to get all items if effective_top_k > len(self.embeddings_): effective_top_k = len(self.embeddings_) # Only pass search_k if explicitly set (annoy doesn't accept None) if search_k is not None: neighbors = self.index_.get_nns_by_item( idx_source, effective_top_k, search_k=search_k, include_distances=True ) else: neighbors = self.index_.get_nns_by_item( idx_source, effective_top_k, include_distances=True ) # Reduce neighbours to exclude self-match neighbors = ( np.array(neighbors[0][1:]), np.array(neighbors[1][1:]), ) for idx_target, dist in zip(*neighbors): # Convert distance to similarity based on metric if self.metric == "angular": # angular_dist = sqrt(2 * (1 - cos_sim)) # Therefore: cos_sim = 1 - (dist^2 / 2) similarity = 1 - (dist**2) / 2 elif self.metric == "dot": # Annoy stores negative dot product similarity = -dist elif self.metric == "hamming": # Hamming distance is already in [0, 1] similarity = 1 - dist else: # euclidean, manhattan # No direct cosine relationship; use common heuristic similarity = 1 / (1 + dist) # Only include if above threshold if similarity >= thresh: result_dict = { "source_idx": idx_source, "target_idx": idx_target, "weight": similarity, "source_name": self._labels[idx_source], "target_name": self._labels[idx_target], } results.append(result_dict) neighbor_data = pd.DataFrame(results) if self.verbose: logger.info(f"Found {len(neighbor_data)} similarity pairs above threshold {thresh}") return neighbor_data def _build_graph(self, neighbor_data: pd.DataFrame) -> nx.Graph: """ Build a NetworkX graph from pairwise similarities. Creates a graph where: - Nodes represent documents - Edges represent similarities above the threshold (with 'weight' attribute representing similarity) Args: neighbor_data: DataFrame of pairwise similarities Returns: The constructed NetworkX graph Raises: ValueError: If training data hasn't been set Note: The graph includes all documents as nodes, even if they have no similarities above threshold. """ if self._labels is None: raise ValueError("No training documents found. Call fit() first.") if self.verbose: logger.info(f"Building graph from {len(neighbor_data)} similarity edges") # Instantiate undirected graph G = nx.Graph() # Add all nodes with their attributes for i in range(len(self._labels)): # Set basic attributes attrs = { "label": self._labels[i], } # Add custom node data if provided for this specific node if self._node_data is not None and i in self._node_data: attrs.update(self._node_data[i]) G.add_node(i, **attrs) # Add edges from neighbor data for _, row in neighbor_data.iterrows(): G.add_edge( row["source_idx"], row["target_idx"], weight=row["weight"], ) if self.verbose: num_components = nx.number_connected_components(G) logger.info( f"Built graph with {G.number_of_nodes()} nodes, {G.number_of_edges()} edges, {num_components} components" ) return G