From 386fa714bf6afe31f97d07c09a41dfe5f86eedc2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:07:21 +0000 Subject: [PATCH 01/14] Initial plan From 9b1748e6f270ac7409e7199e7955363e0b00cabe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Feb 2026 08:08:45 +0000 Subject: [PATCH 02/14] refactor(test): rename config var to logging_cfg in pytest_configure to avoid shadowing Co-authored-by: gkostkowski <12532923+gkostkowski@users.noreply.github.com> --- test/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 7553ee9..8f3de23 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -29,8 +29,8 @@ def pytest_configure(config: pytest.Config): # Setup logging from YAML config file cfg_path = os.path.join(os.path.dirname(__file__), "resources/logging-test.yml") with open(cfg_path) as f: - config = yaml.safe_load(f) - logging.config.dictConfig(config) + logging_cfg = yaml.safe_load(f) + logging.config.dictConfig(logging_cfg) # ============================================================================ From cd3e1d1ed7fcb6b59e3506e43819600cd9704cc8 Mon Sep 17 00:00:00 2001 From: Grzegorz Kostkowski Date: Thu, 5 Mar 2026 10:26:53 +0100 Subject: [PATCH 03/14] chore: reformat code with Ruff --- src/ere/adapters/duckdb_repositories.py | 8 +- src/ere/adapters/rdf_mapper.py | 4 +- src/ere/adapters/rdf_mapper_impl.py | 15 +- src/ere/adapters/repositories.py | 8 +- src/ere/adapters/splink_linker_impl.py | 78 +++++---- src/ere/adapters/utils.py | 10 +- src/ere/entrypoints/app.py | 4 +- src/ere/entrypoints/queue_worker.py | 8 +- src/ere/models/exceptions.py | 4 +- src/ere/models/resolver/mention.py | 6 +- src/ere/services/entity_resolution_service.py | 28 +++- src/ere/services/factories.py | 12 +- src/ere/services/resolver_config.py | 4 +- test/conftest.py | 1 + 
test/e2e/test_ere.py | 4 +- .../test_direct_service_resolution_steps.py | 148 +++++++++++++++--- .../test_entity_resolution_algorithm_steps.py | 24 ++- test/integration/test_entity_resolver.py | 48 ++++-- test/integration/test_redis_integration.py | 20 ++- test/stress/stress_test.py | 5 +- test/unit/adapters/stubs.py | 6 +- test/unit/adapters/test_duckdb_adapters.py | 30 ++-- .../test_entity_resolution_service.py | 52 +++--- 23 files changed, 378 insertions(+), 149 deletions(-) diff --git a/src/ere/adapters/duckdb_repositories.py b/src/ere/adapters/duckdb_repositories.py index 7b58ad9..c65caa8 100644 --- a/src/ere/adapters/duckdb_repositories.py +++ b/src/ere/adapters/duckdb_repositories.py @@ -3,7 +3,13 @@ import duckdb import pandas as pd -from ere.models.resolver import ClusterId, ClusterMembership, Mention, MentionId, MentionLink +from ere.models.resolver import ( + ClusterId, + ClusterMembership, + Mention, + MentionId, + MentionLink, +) from ere.adapters.repositories import ( ClusterRepository, MentionRepository, diff --git a/src/ere/adapters/rdf_mapper.py b/src/ere/adapters/rdf_mapper.py index 37bf6bb..1f45fc1 100644 --- a/src/ere/adapters/rdf_mapper.py +++ b/src/ere/adapters/rdf_mapper.py @@ -87,9 +87,7 @@ def extract_mention_attributes( entity_subject = graph.value(predicate=RDF.type, object=rdf_type) if entity_subject is None: - raise ValueError( - f"No entity of type {rdf_type} found in RDF content" - ) + raise ValueError(f"No entity of type {rdf_type} found in RDF content") # Extract attributes per config attributes = {} diff --git a/src/ere/adapters/rdf_mapper_impl.py b/src/ere/adapters/rdf_mapper_impl.py index 243f6b1..e18d6a3 100644 --- a/src/ere/adapters/rdf_mapper_impl.py +++ b/src/ere/adapters/rdf_mapper_impl.py @@ -42,7 +42,12 @@ def _load_mappings(rdf_mapping_path: str | Path = None) -> dict: dict: Entity type mappings from config. 
""" if rdf_mapping_path is None: - rdf_mapping_path = Path(__file__).parent.parent.parent.parent / "infra" / "config" / "rdf_mapping.yaml" + rdf_mapping_path = ( + Path(__file__).parent.parent.parent.parent + / "infra" + / "config" + / "rdf_mapping.yaml" + ) else: rdf_mapping_path = Path(rdf_mapping_path) return load_entity_mappings(rdf_mapping_path) @@ -70,9 +75,13 @@ def map_entity_mention_to_domain(self, entity_mention: EntityMention) -> Mention ) mention_id = MentionId( - value=self._derive_mention_id(eid.source_id, eid.request_id, eid.entity_type) + value=self._derive_mention_id( + eid.source_id, eid.request_id, eid.entity_type + ) + ) + attributes = extract_mention_attributes( + entity_mention.content, entity_type_config ) - attributes = extract_mention_attributes(entity_mention.content, entity_type_config) return Mention(id=mention_id, attributes=attributes) @staticmethod diff --git a/src/ere/adapters/repositories.py b/src/ere/adapters/repositories.py index 6ac6dc2..2a99e6d 100644 --- a/src/ere/adapters/repositories.py +++ b/src/ere/adapters/repositories.py @@ -9,7 +9,13 @@ from abc import ABC, abstractmethod -from ere.models.resolver import ClusterId, ClusterMembership, Mention, MentionId, MentionLink +from ere.models.resolver import ( + ClusterId, + ClusterMembership, + Mention, + MentionId, + MentionLink, +) class MentionRepository(ABC): diff --git a/src/ere/adapters/splink_linker_impl.py b/src/ere/adapters/splink_linker_impl.py index 8172614..283432a 100644 --- a/src/ere/adapters/splink_linker_impl.py +++ b/src/ere/adapters/splink_linker_impl.py @@ -45,7 +45,9 @@ def build_tf_df(mentions: list[Mention], entity_fields: list[str]) -> pd.DataFra flat_dict = mention.to_flat_dict() row = { "mention_id": flat_dict["mention_id"], - **{f: flat_dict.get(f) or "" for f in entity_fields}, # Convert None to empty string + **{ + f: flat_dict.get(f) or "" for f in entity_fields + }, # Convert None to empty string "__splink_salt": 0.5, } rows.append(row) @@ -246,11 
+248,15 @@ def register_mention(self, mention: Mention) -> None: ) # Build new row with same schema as _tf_df - new_row = pd.DataFrame([{ - "mention_id": flat_dict["mention_id"], - **{f: flat_dict.get(f) for f in self._entity_fields}, - "__splink_salt": 0.5, - }]) + new_row = pd.DataFrame( + [ + { + "mention_id": flat_dict["mention_id"], + **{f: flat_dict.get(f) for f in self._entity_fields}, + "__splink_salt": 0.5, + } + ] + ) # Cast string columns to pd.StringDtype() to prevent type drift on None values for col in self._entity_fields: @@ -324,7 +330,9 @@ def _build_settings(self) -> SettingsCreator: comp["field"], thresholds, ) - comparisons.append(cl.JaroWinklerAtThresholds(comp["field"], thresholds)) + comparisons.append( + cl.JaroWinklerAtThresholds(comp["field"], thresholds) + ) elif comp["type"] == "exact_match": log.trace( "_build_settings: Adding ExactMatch comparison on field '%s'", @@ -406,7 +414,9 @@ def _train_safe(self) -> None: log.info("EM training: estimating u-probabilities via random sampling") linker_new.training.estimate_u_using_random_sampling(max_pairs=1e6) - log.info("EM training: estimating m-probabilities and lambda via EM algorithm") + log.info( + "EM training: estimating m-probabilities and lambda via EM algorithm" + ) linker_new.training.estimate_parameters_using_expectation_maximisation( self._get_em_training_rule(), estimate_without_term_frequencies=True ) @@ -455,12 +465,16 @@ def _apply_cold_start_params(self) -> None: # Check if cold_start config exists cold_start_cfg = self._config.get("splink", {}).get("cold_start", {}) if not cold_start_cfg: - log.info("Linker initializing: No cold_start config found, using Splink defaults") + log.info( + "Linker initializing: No cold_start config found, using Splink defaults" + ) return comparisons_cfg = cold_start_cfg.get("comparisons", {}) if not comparisons_cfg: - log.info("Linker initializing: No comparisons config in cold_start, using Splink defaults") + log.info( + "Linker initializing: 
No comparisons config in cold_start, using Splink defaults" + ) return log.info( @@ -475,11 +489,11 @@ def _apply_cold_start_params(self) -> None: for _, comparison in enumerate(self._linker._settings_obj.comparisons): # Get the field name from the comparison field_name = None - if hasattr(comparison, 'output_column_name'): + if hasattr(comparison, "output_column_name"): field_name = comparison.output_column_name - elif hasattr(comparison, '_field_names') and comparison._field_names: + elif hasattr(comparison, "_field_names") and comparison._field_names: field_name = comparison._field_names[0] - # pylint: enable=protected-access + # pylint: enable=protected-access if field_name not in comparisons_cfg: continue @@ -494,8 +508,9 @@ def _apply_cold_start_params(self) -> None: # Collect non-null levels to properly map cold-start probabilities non_null_levels = [ - (i, level) for i, level in enumerate(comparison.comparison_levels) - if not (hasattr(level, 'is_null_level') and level.is_null_level) + (i, level) + for i, level in enumerate(comparison.comparison_levels) + if not (hasattr(level, "is_null_level") and level.is_null_level) ] log.trace( "_apply_cold_start_params: Field '%s' has %d non-null levels: %s", @@ -505,8 +520,8 @@ def _apply_cold_start_params(self) -> None: ) # Apply m-probabilities to non-null levels in order - if 'm_probabilities' in field_cfg: - m_probs = field_cfg['m_probabilities'] + if "m_probabilities" in field_cfg: + m_probs = field_cfg["m_probabilities"] for config_idx, m_prob in enumerate(m_probs): if config_idx < len(non_null_levels): actual_level_idx, level = non_null_levels[config_idx] @@ -528,8 +543,8 @@ def _apply_cold_start_params(self) -> None: ) # Apply u-probabilities to non-null levels in order - if 'u_probabilities' in field_cfg: - u_probs = field_cfg['u_probabilities'] + if "u_probabilities" in field_cfg: + u_probs = field_cfg["u_probabilities"] for config_idx, u_prob in enumerate(u_probs): if config_idx < len(non_null_levels): 
actual_level_idx, level = non_null_levels[config_idx] @@ -566,7 +581,7 @@ def _log_trained_parameters(self, linker: Linker) -> None: # Get the Fellegi-Sunter prior (lambda) prior = None # pylint: disable=protected-access # Splink exposes no public API for settings introspection - if hasattr(linker._settings_obj, 'probability_two_random_records_match'): + if hasattr(linker._settings_obj, "probability_two_random_records_match"): prior = linker._settings_obj.probability_two_random_records_match log.info( "EM trained parameter: lambda (P(match)) = %.6f", @@ -577,11 +592,11 @@ def _log_trained_parameters(self, linker: Linker) -> None: for comparison in linker._settings_obj.comparisons: # Get field name field_name = None - if hasattr(comparison, 'output_column_name'): + if hasattr(comparison, "output_column_name"): field_name = comparison.output_column_name - elif hasattr(comparison, '_field_names') and comparison._field_names: + elif hasattr(comparison, "_field_names") and comparison._field_names: field_name = comparison._field_names[0] - # pylint: enable=protected-access + # pylint: enable=protected-access if not field_name: continue @@ -593,8 +608,9 @@ def _log_trained_parameters(self, linker: Linker) -> None: # Collect non-null levels non_null_levels = [ - (i, level) for i, level in enumerate(comparison.comparison_levels) - if not (hasattr(level, 'is_null_level') and level.is_null_level) + (i, level) + for i, level in enumerate(comparison.comparison_levels) + if not (hasattr(level, "is_null_level") and level.is_null_level) ] # Log m and u probabilities for each level @@ -605,19 +621,25 @@ def _log_trained_parameters(self, linker: Linker) -> None: trained_u = False # Extract m-probability - if hasattr(level, 'm_probability') and level.m_probability is not None: + if ( + hasattr(level, "m_probability") + and level.m_probability is not None + ): m_prob = level.m_probability # Check if it was trained (non-cold-start values have specific patterns) # Cold-start values are 
typically set exactly; trained values may vary trained_m = True # Extract u-probability - if hasattr(level, 'u_probability') and level.u_probability is not None: + if ( + hasattr(level, "u_probability") + and level.u_probability is not None + ): u_prob = level.u_probability trained_u = True # Log level details - level_desc = getattr(level, 'label', f"Level {config_idx}") + level_desc = getattr(level, "label", f"Level {config_idx}") m_status = "✓ trained" if trained_m else "✗ cold-start" u_status = "✓ trained" if trained_u else "✗ cold-start" diff --git a/src/ere/adapters/utils.py b/src/ere/adapters/utils.py index 63ad5f9..c1535ae 100644 --- a/src/ere/adapters/utils.py +++ b/src/ere/adapters/utils.py @@ -21,7 +21,10 @@ ) SUPPORTED_REQUEST_CLASSES = { - cls.__name__: cls for cls in [EntityMentionResolutionRequest] # , FullRebuildRequest] # TODO: Add when available + cls.__name__: cls + for cls in [ + EntityMentionResolutionRequest + ] } """ Explicit list of supported Request classes, used in utilities like :meth:`get_request_from_message`. @@ -34,7 +37,10 @@ SUPPORTED_RESPONSE_CLASSES = { cls.__name__: cls - for cls in [EntityMentionResolutionResponse, EREErrorResponse] # , FullRebuildResponse] # TODO: Add when available + for cls in [ + EntityMentionResolutionResponse, + EREErrorResponse, + ] } """ Explicit list of supported Response classes, used in utilities like :meth:`get_response_from_message`. 
diff --git a/src/ere/entrypoints/app.py b/src/ere/entrypoints/app.py index e2bfd35..e4077db 100644 --- a/src/ere/entrypoints/app.py +++ b/src/ere/entrypoints/app.py @@ -78,7 +78,9 @@ def main() -> None: # Config file paths: CLI takes precedence over environment rdf_mapping_path = args.rdf_mapping_path or os.environ.get("RDF_MAPPING_PATH") - resolver_config_path = args.resolver_config_path or os.environ.get("RESOLVER_CONFIG_PATH") + resolver_config_path = args.resolver_config_path or os.environ.get( + "RESOLVER_CONFIG_PATH" + ) duckdb_path = os.environ.get("DUCKDB_PATH") log.info( diff --git a/src/ere/entrypoints/queue_worker.py b/src/ere/entrypoints/queue_worker.py index 020f18c..e3d435b 100644 --- a/src/ere/entrypoints/queue_worker.py +++ b/src/ere/entrypoints/queue_worker.py @@ -47,7 +47,9 @@ def process_single_message(self) -> bool: Exception: Propagates connection errors. """ # Wait for a request - queue_message = self.redis_client.brpop(self.request_queue, timeout=self.queue_timeout) + queue_message = self.redis_client.brpop( + self.request_queue, timeout=self.queue_timeout + ) if not queue_message: return False # Timeout @@ -88,7 +90,9 @@ def _send_response(self, response: EREResponse) -> None: log.error("Failed to send response: %s", e) @staticmethod - def _build_error_response(error_detail: str, ere_request_id: str = "unknown") -> EREErrorResponse: + def _build_error_response( + error_detail: str, ere_request_id: str = "unknown" + ) -> EREErrorResponse: """Build error response for request processing failures.""" log.error("Building error response: %s", error_detail) return EREErrorResponse( diff --git a/src/ere/models/exceptions.py b/src/ere/models/exceptions.py index 889a82c..2d648d4 100644 --- a/src/ere/models/exceptions.py +++ b/src/ere/models/exceptions.py @@ -4,7 +4,9 @@ class ConflictError(Exception): """Raised when the same mention_id is submitted with different content.""" - def __init__(self, mention_id: str, existing_attributes: dict, 
incoming_attributes: dict): + def __init__( + self, mention_id: str, existing_attributes: dict, incoming_attributes: dict + ): super().__init__( f"Mention '{mention_id}' was already resolved with different content. " f"Existing: {existing_attributes!r}, Incoming: {incoming_attributes!r}" diff --git a/src/ere/models/resolver/mention.py b/src/ere/models/resolver/mention.py index 71c09ac..4e5cd05 100644 --- a/src/ere/models/resolver/mention.py +++ b/src/ere/models/resolver/mention.py @@ -28,7 +28,11 @@ def _from_flat_dict(cls, raw_input: object) -> object: {"mention_id": "m1", "legal_name": "Acme", "country_code": "US"} and convert to the structured form expected by the model. """ - if isinstance(raw_input, dict) and "mention_id" in raw_input and "id" not in raw_input: + if ( + isinstance(raw_input, dict) + and "mention_id" in raw_input + and "id" not in raw_input + ): return { "id": MentionId(value=raw_input["mention_id"]), "attributes": {k: v for k, v in raw_input.items() if k != "mention_id"}, diff --git a/src/ere/services/entity_resolution_service.py b/src/ere/services/entity_resolution_service.py index dc27376..2bbec9b 100644 --- a/src/ere/services/entity_resolution_service.py +++ b/src/ere/services/entity_resolution_service.py @@ -128,7 +128,9 @@ def resolve(self, mention: Mention) -> ResolutionResult: cluster_id = ClusterId(value=mention.id.value) log.trace("New cluster generated for mention with id=%s", mention.id.value) - self._cluster_repo.save(ClusterMembership(mention_id=mention.id, cluster_id=cluster_id)) + self._cluster_repo.save( + ClusterMembership(mention_id=mention.id, cluster_id=cluster_id) + ) # Log cluster contents after assignment all_memberships = self._cluster_repo.get_all_memberships() @@ -147,7 +149,10 @@ def resolve(self, mention: Mention) -> ResolutionResult: # Trigger auto-training if threshold is reached (non-blocking background thread). 
count = self._mention_repo.count() - if self._config.auto_train_threshold > 0 and count == self._config.auto_train_threshold: + if ( + self._config.auto_train_threshold > 0 + and count == self._config.auto_train_threshold + ): log.info( "Auto-training triggered: %d mentions reached (threshold=%d). " "Starting background EM training thread. Scoring continues with current parameters.", @@ -155,9 +160,7 @@ def resolve(self, mention: Mention) -> ResolutionResult: self._config.auto_train_threshold, ) threading.Thread( - target=self._linker.train, - daemon=True, - name="linker-training" + target=self._linker.train, daemon=True, name="linker-training" ).start() # Step 5: Return cluster references (non-empty, always top-N). @@ -351,7 +354,9 @@ def resolve_to_result( def resolve_entity_mention( - entity_mention: EntityMention, resolver: EntityResolver = None, mapper: RDFMapper = None + entity_mention: EntityMention, + resolver: EntityResolver = None, + mapper: RDFMapper = None, ) -> ClusterReference: """ Resolve an entity mention to a Cluster (public API - returns top candidate). 
@@ -454,7 +459,9 @@ def process_request(self, request: ERERequest) -> EREResponse: entity_mention.identifiedBy.request_id, ) - resolution_outcome = resolve_to_result(entity_mention, self._resolver, self._mapper) + resolution_outcome = resolve_to_result( + entity_mention, self._resolver, self._mapper + ) # Log resolution result with candidates candidate_info = [ @@ -482,7 +489,12 @@ def process_request(self, request: ERERequest) -> EREResponse: timestamp=now, ) except Exception as exc: # pylint: disable=broad-exception-caught - log.error("Resolution error for mention %s: %s", request.ere_request_id, exc, exc_info=True) + log.error( + "Resolution error for mention %s: %s", + request.ere_request_id, + exc, + exc_info=True, + ) return EREErrorResponse( ere_request_id=request.ere_request_id, error_type=type(exc).__name__, diff --git a/src/ere/services/factories.py b/src/ere/services/factories.py index 6442ae8..0766616 100644 --- a/src/ere/services/factories.py +++ b/src/ere/services/factories.py @@ -19,7 +19,10 @@ from ere.adapters.duckdb_schema import init_schema from ere.adapters.rdf_mapper_port import RDFMapper from ere.adapters.splink_linker_impl import SpLinkSimilarityLinker -from ere.services.entity_resolution_service import EntityResolver, EntityResolutionService +from ere.services.entity_resolution_service import ( + EntityResolver, + EntityResolutionService, +) from ere.services.resolver_config import ResolverConfig @@ -47,7 +50,12 @@ def build_entity_resolver( Fully-constructed EntityResolver with DuckDB backend and Splink linker. 
""" if resolver_config_path is None: - config_path = Path(__file__).parent.parent.parent.parent / "infra" / "config" / "resolver.yaml" + config_path = ( + Path(__file__).parent.parent.parent.parent + / "infra" + / "config" + / "resolver.yaml" + ) else: config_path = Path(resolver_config_path) diff --git a/src/ere/services/resolver_config.py b/src/ere/services/resolver_config.py index e3c839f..50b49bb 100644 --- a/src/ere/services/resolver_config.py +++ b/src/ere/services/resolver_config.py @@ -7,7 +7,9 @@ class DuckDBConfig(BaseModel): """DuckDB database configuration.""" type: str = "in-memory" # "in-memory" or "persistent" - path: str = ":memory:" # Database path: ":memory:" for in-memory, file path for persistent + path: str = ( + ":memory:" # Database path: ":memory:" for in-memory, file path for persistent + ) class ResolverConfig(BaseModel): diff --git a/test/conftest.py b/test/conftest.py index 4cdc4d2..1d93d39 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -215,6 +215,7 @@ def rdf_mapper(rdf_mapping_path): # pylint: disable=redefined-outer-name # pyt # Redis fixture # ============================================================================ + @pytest.fixture(scope="module") def redis_client(): """ diff --git a/test/e2e/test_ere.py b/test/e2e/test_ere.py index e5bb5ee..c8d05e5 100644 --- a/test/e2e/test_ere.py +++ b/test/e2e/test_ere.py @@ -141,7 +141,9 @@ def test_single_request_resolution_flow(redis_client, redis_queues, queue_worker redis_client.rpush(request_queue, request_bytes) # 2. Process message using worker - assert queue_worker.process_single_message() is True, "Worker should process message" + assert queue_worker.process_single_message() is True, ( + "Worker should process message" + ) # 3. 
Verify response in queue result = redis_client.brpop(response_queue, timeout=1) diff --git a/test/features/steps/test_direct_service_resolution_steps.py b/test/features/steps/test_direct_service_resolution_steps.py index 678efea..b39d6b8 100644 --- a/test/features/steps/test_direct_service_resolution_steps.py +++ b/test/features/steps/test_direct_service_resolution_steps.py @@ -2,6 +2,7 @@ Tests resolve_entity_mention(EntityMention) -> ClusterReference directly. """ + import pytest from assertpy import assert_that from erspec.models.core import ClusterReference, EntityMention, EntityMentionIdentifier @@ -41,6 +42,7 @@ def outcome(): # store either "result" or "exception" return {"result": None, "exception": None} + # --------------------------------------------------------------------------- # Background # --------------------------------------------------------------------------- @@ -58,11 +60,23 @@ def fresh_service(entity_resolution_service): @given( - parsers.parse('entity mention "{mention_id}" of type "{entity_type}" was already resolved with content from "{rdf_file_first}"'), + parsers.parse( + 'entity mention "{mention_id}" of type "{entity_type}" was already resolved with content from "{rdf_file_first}"' + ), target_fixture="seed_result", ) -def pre_resolve(mention_id: str, entity_type: str, rdf_file_first: str, entity_resolution_service, rdf_mapper) -> ClusterReference: - return resolve_entity_mention(_make_mention(mention_id, entity_type, load_rdf(rdf_file_first)), entity_resolution_service, rdf_mapper) +def pre_resolve( + mention_id: str, + entity_type: str, + rdf_file_first: str, + entity_resolution_service, + rdf_mapper, +) -> ClusterReference: + return resolve_entity_mention( + _make_mention(mention_id, entity_type, load_rdf(rdf_file_first)), + entity_resolution_service, + rdf_mapper, + ) # --------------------------------------------------------------------------- @@ -71,19 +85,43 @@ def pre_resolve(mention_id: str, entity_type: str, rdf_file_first: 
str, entity_r @when( - parsers.parse('I resolve the first entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"'), + parsers.parse( + 'I resolve the first entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"' + ), target_fixture="first_result", ) -def resolve_first(mention_id: str, entity_type: str, rdf_file: str, entity_resolution_service, rdf_mapper) -> ClusterReference: - return resolve_entity_mention(_make_mention(mention_id, entity_type, load_rdf(rdf_file)), entity_resolution_service, rdf_mapper) +def resolve_first( + mention_id: str, + entity_type: str, + rdf_file: str, + entity_resolution_service, + rdf_mapper, +) -> ClusterReference: + return resolve_entity_mention( + _make_mention(mention_id, entity_type, load_rdf(rdf_file)), + entity_resolution_service, + rdf_mapper, + ) @when( - parsers.parse('I resolve the second entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"'), + parsers.parse( + 'I resolve the second entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"' + ), target_fixture="second_result", ) -def resolve_second(mention_id: str, entity_type: str, rdf_file: str, entity_resolution_service, rdf_mapper) -> ClusterReference: - return resolve_entity_mention(_make_mention(mention_id, entity_type, load_rdf(rdf_file)), entity_resolution_service, rdf_mapper) +def resolve_second( + mention_id: str, + entity_type: str, + rdf_file: str, + entity_resolution_service, + rdf_mapper, +) -> ClusterReference: + return resolve_entity_mention( + _make_mention(mention_id, entity_type, load_rdf(rdf_file)), + entity_resolution_service, + rdf_mapper, + ) # --------------------------------------------------------------------------- @@ -92,20 +130,40 @@ def resolve_second(mention_id: str, entity_type: str, rdf_file: str, entity_reso @when( - parsers.parse('I resolve entity mention "{mention_id}" of type "{entity_type}" with content from 
"{rdf_file}"'), + parsers.parse( + 'I resolve entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"' + ), target_fixture="first_result", ) -def resolve_mention(mention_id: str, entity_type: str, rdf_file: str, entity_resolution_service, rdf_mapper) -> ClusterReference: +def resolve_mention( + mention_id: str, + entity_type: str, + rdf_file: str, + entity_resolution_service, + rdf_mapper, +) -> ClusterReference: mention = _make_mention(mention_id, entity_type, load_rdf(rdf_file)) return resolve_entity_mention(mention, entity_resolution_service, rdf_mapper) @when( - parsers.parse('I resolve entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}" again'), + parsers.parse( + 'I resolve entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}" again' + ), target_fixture="second_result", ) -def resolve_mention_again(mention_id: str, entity_type: str, rdf_file: str, entity_resolution_service, rdf_mapper) -> ClusterReference: - return resolve_entity_mention(_make_mention(mention_id, entity_type, load_rdf(rdf_file)), entity_resolution_service, rdf_mapper) +def resolve_mention_again( + mention_id: str, + entity_type: str, + rdf_file: str, + entity_resolution_service, + rdf_mapper, +) -> ClusterReference: + return resolve_entity_mention( + _make_mention(mention_id, entity_type, load_rdf(rdf_file)), + entity_resolution_service, + rdf_mapper, + ) # --------------------------------------------------------------------------- @@ -114,12 +172,25 @@ def resolve_mention_again(mention_id: str, entity_type: str, rdf_file: str, enti @when( - parsers.parse('I try to resolve entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"'), + parsers.parse( + 'I try to resolve entity mention "{mention_id}" of type "{entity_type}" with content from "{rdf_file}"' + ), target_fixture="raised_exception", ) -def try_resolve_conflict(mention_id: str, entity_type: str, rdf_file: str, 
outcome, entity_resolution_service, rdf_mapper) -> Exception | None: +def try_resolve_conflict( + mention_id: str, + entity_type: str, + rdf_file: str, + outcome, + entity_resolution_service, + rdf_mapper, +) -> Exception | None: try: - outcome["result"] = resolve_entity_mention(_make_mention(mention_id, entity_type, load_rdf(rdf_file)), entity_resolution_service, rdf_mapper) + outcome["result"] = resolve_entity_mention( + _make_mention(mention_id, entity_type, load_rdf(rdf_file)), + entity_resolution_service, + rdf_mapper, + ) return None except Exception as exc: outcome["exception"] = exc @@ -128,12 +199,25 @@ def try_resolve_conflict(mention_id: str, entity_type: str, rdf_file: str, outco @when( # parsers.re required: parsers.parse cannot match an empty string for {bad_content} - parsers.re(r'I try to resolve entity mention "(?P[^"]+)" of type "(?P[^"]+)" with invalid content "(?P.*)"'), + parsers.re( + r'I try to resolve entity mention "(?P[^"]+)" of type "(?P[^"]+)" with invalid content "(?P.*)"' + ), target_fixture="raised_exception", ) -def try_resolve_malformed(mention_id: str, entity_type: str, bad_content: str, outcome, entity_resolution_service, rdf_mapper) -> Exception | None: +def try_resolve_malformed( + mention_id: str, + entity_type: str, + bad_content: str, + outcome, + entity_resolution_service, + rdf_mapper, +) -> Exception | None: try: - outcome["result"] = resolve_entity_mention(_make_mention(mention_id, entity_type, bad_content), entity_resolution_service, rdf_mapper) + outcome["result"] = resolve_entity_mention( + _make_mention(mention_id, entity_type, bad_content), + entity_resolution_service, + rdf_mapper, + ) return None except Exception as exc: outcome["exception"] = exc @@ -146,7 +230,9 @@ def try_resolve_malformed(mention_id: str, entity_type: str, bad_content: str, o @then("both results are ClusterReference instances") -def check_cluster_reference_type(first_result: ClusterReference, second_result: ClusterReference): +def 
check_cluster_reference_type( + first_result: ClusterReference, second_result: ClusterReference +): assert_that(first_result).is_instance_of(ClusterReference) assert_that(second_result).is_instance_of(ClusterReference) @@ -157,12 +243,16 @@ def check_same_cluster(first_result: ClusterReference, second_result: ClusterRef @then("the cluster_ids are different") -def check_different_clusters(first_result: ClusterReference, second_result: ClusterReference): +def check_different_clusters( + first_result: ClusterReference, second_result: ClusterReference +): assert_that(first_result.cluster_id).is_not_equal_to(second_result.cluster_id) @then("both ClusterReference results are identical") -def check_identical_results(first_result: ClusterReference, second_result: ClusterReference): +def check_identical_results( + first_result: ClusterReference, second_result: ClusterReference +): assert_that(first_result).is_equal_to(second_result) assert_that(first_result).is_equal_to(second_result) @@ -183,7 +273,9 @@ def check_exception_raised(outcome): ) elif isinstance(raised_exception, ConflictError): # Conflict errors should contain mention_id and indicate content mismatch - assert_that(str(raised_exception)).contains("was already resolved with different content") + assert_that(str(raised_exception)).contains( + "was already resolved with different content" + ) @then("the result is a ClusterReference") @@ -193,7 +285,9 @@ def check_single_result_type(first_result: ClusterReference): @then("the cluster_id matches the seed cluster") -def check_matches_seed_cluster(first_result: ClusterReference, seed_result: ClusterReference): +def check_matches_seed_cluster( + first_result: ClusterReference, seed_result: ClusterReference +): """Verify new mention joined the pre-established cluster (not a new one).""" assert_that(first_result.cluster_id).is_equal_to(seed_result.cluster_id) @@ -207,4 +301,6 @@ def check_unsupported_entity_type_exception(outcome): f"Result was: {outcome['result']!r}" ) 
assert_that(raised_exception).is_instance_of(ValueError) - assert_that(str(raised_exception)).matches(r"No rdf_mapping configured for entity_type") + assert_that(str(raised_exception)).matches( + r"No rdf_mapping configured for entity_type" + ) diff --git a/test/features/steps/test_entity_resolution_algorithm_steps.py b/test/features/steps/test_entity_resolution_algorithm_steps.py index 89d0984..c41e42c 100644 --- a/test/features/steps/test_entity_resolution_algorithm_steps.py +++ b/test/features/steps/test_entity_resolution_algorithm_steps.py @@ -81,7 +81,7 @@ def resolve_mention(mention_id: str, algorithm_context): # Create mention mention = Mention( id=MentionId(value=mention_id), - attributes={"legal_name": f"Company {mention_id}", "country_code": "US"} + attributes={"legal_name": f"Company {mention_id}", "country_code": "US"}, ) # Update linker with new similarities @@ -102,7 +102,9 @@ def resolve_mention(mention_id: str, algorithm_context): algorithm_context["last_result"] = result -@when(parsers.parse('I set similarity between "{left_id}" and "{right_id}" to {score:f}')) +@when( + parsers.parse('I set similarity between "{left_id}" and "{right_id}" to {score:f}') +) def set_similarity(left_id: str, right_id: str, score: float, algorithm_context): """Set similarity between two mentions.""" pair_set = frozenset([left_id, right_id]) @@ -114,8 +116,14 @@ def set_similarity(left_id: str, right_id: str, score: float, algorithm_context) # =============================================================================== -@then(parsers.parse('mention "{mention_id}" is in cluster "{cluster_id}" with score {score:f}')) -def check_mention_cluster(mention_id: str, cluster_id: str, score: float, algorithm_context): +@then( + parsers.parse( + 'mention "{mention_id}" is in cluster "{cluster_id}" with score {score:f}' + ) +) +def check_mention_cluster( + mention_id: str, cluster_id: str, score: float, algorithm_context +): """Verify that a mention is assigned to a cluster with 
the expected score.""" result = algorithm_context["last_result"] assert_that(result.top.cluster_id.value).is_equal_to(cluster_id) @@ -129,7 +137,9 @@ def check_candidate_count(count: int, algorithm_context): assert_that(len(result.candidates)).is_equal_to(count) -@then(parsers.parse('candidate {index:d} is cluster "{cluster_id}" with score {score:f}')) +@then( + parsers.parse('candidate {index:d} is cluster "{cluster_id}" with score {score:f}') +) def check_candidate(index: int, cluster_id: str, score: float, algorithm_context): """Verify a specific candidate cluster and its score.""" result = algorithm_context["last_result"] @@ -139,7 +149,9 @@ def check_candidate(index: int, cluster_id: str, score: float, algorithm_context assert_that(candidate.score).is_close_to(score, 0.01) -@then(parsers.parse('the cluster assignment for mention "{mention_id}" is "{cluster_id}"')) +@then( + parsers.parse('the cluster assignment for mention "{mention_id}" is "{cluster_id}"') +) def check_cluster_assignment(mention_id: str, cluster_id: str, algorithm_context): """Verify the cluster assignment from state.""" service = algorithm_context["service"] diff --git a/test/integration/test_entity_resolver.py b/test/integration/test_entity_resolver.py index 5470190..abf5e7e 100644 --- a/test/integration/test_entity_resolver.py +++ b/test/integration/test_entity_resolver.py @@ -122,7 +122,9 @@ def test_first_mention_resolves_to_singleton(service, con): # Verify persistence mention_count = con.execute("SELECT COUNT(*) FROM mentions").fetchone()[0] assert mention_count == 1 - cluster_count = con.execute("SELECT COUNT(DISTINCT cluster_id) FROM clusters").fetchone()[0] + cluster_count = con.execute( + "SELECT COUNT(DISTINCT cluster_id) FROM clusters" + ).fetchone()[0] assert cluster_count == 1 @@ -169,7 +171,9 @@ def test_below_threshold_creates_new_cluster(service, con): assert mention_count == 2 # Verify cluster assignments persist - cluster_count = con.execute("SELECT COUNT(DISTINCT 
cluster_id) FROM clusters").fetchone()[0] + cluster_count = con.execute( + "SELECT COUNT(DISTINCT cluster_id) FROM clusters" + ).fetchone()[0] assert cluster_count >= 1 @@ -243,7 +247,9 @@ def test_train_succeeds_with_sufficient_records(service, con): service.train() # Verify linker is still functional - query = Mention(mention_id="test_q", legal_name="Acme Technologies", country_code="US") + query = Mention( + mention_id="test_q", legal_name="Acme Technologies", country_code="US" + ) result = service.resolve(query) assert result.top is not None @@ -436,11 +442,15 @@ def test_multiple_resolves_accumulate_state(service, con): state = service.state() # Verify state accumulates - assert state.mention_count == i, f"After resolving {i} mentions, should have {i} in DB" + assert state.mention_count == i, ( + f"After resolving {i} mentions, should have {i} in DB" + ) # Later mentions should see earlier mentions in results if i > 1: - assert len(result.candidates) >= 1, "Should see candidates from earlier mentions" + assert len(result.candidates) >= 1, ( + "Should see candidates from earlier mentions" + ) @pytest.mark.integration @@ -452,14 +462,22 @@ def test_end_to_end_realistic_scenario(service, con): # Stream of mentions: 3 companies with variants mentions = [ # Company A - Mention(mention_id="acme_1", legal_name="Acme Corporation Ltd", country_code="US"), + Mention( + mention_id="acme_1", legal_name="Acme Corporation Ltd", country_code="US" + ), Mention(mention_id="acme_2", legal_name="Acme Corp", country_code="US"), Mention(mention_id="acme_3", legal_name="Acme", country_code="US"), # Company B - Mention(mention_id="bestco_1", legal_name="BestCo Industries Inc", country_code="US"), + Mention( + mention_id="bestco_1", legal_name="BestCo Industries Inc", country_code="US" + ), Mention(mention_id="bestco_2", legal_name="BestCo Inc", country_code="US"), # Company C - Mention(mention_id="techsoft_1", legal_name="TechSoft Solutions Limited", country_code="US"), + Mention( + 
mention_id="techsoft_1", + legal_name="TechSoft Solutions Limited", + country_code="US", + ), Mention(mention_id="techsoft_2", legal_name="TechSoft Ltd", country_code="US"), Mention(mention_id="techsoft_3", legal_name="TechSoft", country_code="US"), ] @@ -481,9 +499,14 @@ def test_end_to_end_realistic_scenario(service, con): # Verify all mentions are assigned assert set(mention_to_cluster.keys()) == { - "acme_1", "acme_2", "acme_3", - "bestco_1", "bestco_2", - "techsoft_1", "techsoft_2", "techsoft_3" + "acme_1", + "acme_2", + "acme_3", + "bestco_1", + "bestco_2", + "techsoft_1", + "techsoft_2", + "techsoft_3", }, "All mentions should be assigned to clusters" # Verify different companies are in different clusters @@ -492,5 +515,6 @@ def test_end_to_end_realistic_scenario(service, con): bestco_cluster = mention_to_cluster["bestco_1"] techsoft_cluster = mention_to_cluster["techsoft_1"] - assert len({acme_cluster, bestco_cluster, techsoft_cluster}) == 3, \ + assert len({acme_cluster, bestco_cluster, techsoft_cluster}) == 3, ( "Different companies should be in different clusters" + ) diff --git a/test/integration/test_redis_integration.py b/test/integration/test_redis_integration.py index 2b22234..2b0fac4 100644 --- a/test/integration/test_redis_integration.py +++ b/test/integration/test_redis_integration.py @@ -15,7 +15,9 @@ import pytest -def create_test_request(request_id: str = "test-001", content: str = "John Smith") -> dict: +def create_test_request( + request_id: str = "test-001", content: str = "John Smith" +) -> dict: """Create a valid EntityMentionResolutionRequest for testing.""" return { "type": "EntityMentionResolutionRequest", @@ -80,14 +82,20 @@ def test_receive_response(self, redis_client): if new_response_count == 0: pytest.skip("ERE service not running — skipping response test") - assert new_response_count == 1, f"Expected 1 new response, got {new_response_count}" + assert new_response_count == 1, ( + f"Expected 1 new response, got 
{new_response_count}" + ) # Retrieve and verify response format (latest response is at index 0) response_raw = redis_client.lindex("ere_responses", 0) assert response_raw is not None, "Response is empty" # response_raw is bytes, decode it - response_str = response_raw.decode("utf-8") if isinstance(response_raw, bytes) else response_raw + response_str = ( + response_raw.decode("utf-8") + if isinstance(response_raw, bytes) + else response_raw + ) response = json.loads(response_str) # Verify response structure @@ -115,7 +123,9 @@ def test_multiple_requests(self, redis_client): if new_response_count == 0: pytest.skip("ERE service not running — skipping response verification") - assert new_response_count == 3, f"Expected 3 new responses, got {new_response_count}" + assert new_response_count == 3, ( + f"Expected 3 new responses, got {new_response_count}" + ) def test_redis_authentication(self, redis_client): """Test: Verify Redis connection works with authentication.""" @@ -140,4 +150,4 @@ def test_malformed_request_handling(self, redis_client): if __name__ == "__main__": """Allow running tests directly: python test/integration/test_redis_integration.py""" - pytest.main([__file__, "-v"]) \ No newline at end of file + pytest.main([__file__, "-v"]) diff --git a/test/stress/stress_test.py b/test/stress/stress_test.py index 16e3db5..80dfb55 100644 --- a/test/stress/stress_test.py +++ b/test/stress/stress_test.py @@ -141,7 +141,10 @@ def create_resolver( def seed_and_train( - resolver: EntityResolver, mentions: list[Mention], n_seed: int, skip_train: bool = False + resolver: EntityResolver, + mentions: list[Mention], + n_seed: int, + skip_train: bool = False, ): """ Seed resolver with first n_seed mentions and optionally trigger training. 
diff --git a/test/unit/adapters/stubs.py b/test/unit/adapters/stubs.py index 5529b81..2b7741b 100644 --- a/test/unit/adapters/stubs.py +++ b/test/unit/adapters/stubs.py @@ -15,12 +15,14 @@ def _get_repository_types(): """Lazy import to avoid circular dependency with services.__init__.""" from ere.adapters import repositories + return repositories def _get_linker_type(): """Lazy import to avoid circular dependency.""" from ere.services import linker + return linker @@ -95,9 +97,7 @@ def count(self) -> int: def find_for(self, mention_id: MentionId) -> list[MentionLink]: """Find all links involving the given mention (either side).""" return [ - link - for link in self._links - if mention_id in (link.left_id, link.right_id) + link for link in self._links if mention_id in (link.left_id, link.right_id) ] diff --git a/test/unit/adapters/test_duckdb_adapters.py b/test/unit/adapters/test_duckdb_adapters.py index 8087bc9..03f5b79 100644 --- a/test/unit/adapters/test_duckdb_adapters.py +++ b/test/unit/adapters/test_duckdb_adapters.py @@ -80,7 +80,7 @@ def test_resolve_first_mention_persists_to_db(service, con): """ mention = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", "country_code": "US"}, ) result = service.resolve(mention) @@ -89,7 +89,9 @@ def test_resolve_first_mention_persists_to_db(service, con): mention_count = con.execute("SELECT COUNT(*) FROM mentions").fetchone()[0] assert mention_count == 1 - cluster_count = con.execute("SELECT COUNT(DISTINCT cluster_id) FROM clusters").fetchone()[0] + cluster_count = con.execute( + "SELECT COUNT(DISTINCT cluster_id) FROM clusters" + ).fetchone()[0] assert cluster_count == 1 # Check state @@ -109,11 +111,11 @@ def test_resolve_strong_match_joins_cluster_in_db(service, con): """ m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 
= Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) # Set up linker to return high score @@ -144,11 +146,11 @@ def test_resolve_weak_match_creates_separate_cluster(service, con): """ m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Similar but different", "country_code": "US"} + attributes={"legal_name": "Similar but different", "country_code": "US"}, ) # Linker returns score below clustering threshold (0.8) @@ -179,11 +181,11 @@ def test_resolve_no_match_creates_singleton_cluster(service, con): """ m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Completely Different", "country_code": "UK"} + attributes={"legal_name": "Completely Different", "country_code": "UK"}, ) # No similarity map entry = no match @@ -202,12 +204,10 @@ def test_resolve_no_match_creates_singleton_cluster(service, con): def test_state_returns_correct_counts(service, con): """Verify that service.state() returns accurate counts.""" m1 = Mention( - id=MentionId(value="m1"), - attributes={"legal_name": "A", "country_code": "US"} + id=MentionId(value="m1"), attributes={"legal_name": "A", "country_code": "US"} ) m2 = Mention( - id=MentionId(value="m2"), - attributes={"legal_name": "B", "country_code": "US"} + id=MentionId(value="m2"), attributes={"legal_name": "B", "country_code": "US"} ) service._linker._similarity_map = {frozenset(["m1", "m2"]): 0.9} @@ -224,12 +224,10 @@ def test_state_returns_correct_counts(service, con): def test_cluster_membership_mapping(service, con): """Verify cluster_membership dict is correctly structured.""" m1 = 
Mention( - id=MentionId(value="m1"), - attributes={"legal_name": "A", "country_code": "US"} + id=MentionId(value="m1"), attributes={"legal_name": "A", "country_code": "US"} ) m2 = Mention( - id=MentionId(value="m2"), - attributes={"legal_name": "B", "country_code": "US"} + id=MentionId(value="m2"), attributes={"legal_name": "B", "country_code": "US"} ) service._linker._similarity_map = {frozenset(["m1", "m2"]): 0.9} diff --git a/test/unit/services/test_entity_resolution_service.py b/test/unit/services/test_entity_resolution_service.py index dfca7d2..0948f06 100644 --- a/test/unit/services/test_entity_resolution_service.py +++ b/test/unit/services/test_entity_resolution_service.py @@ -57,7 +57,7 @@ def test_first_mention_is_singleton(service): """Resolving the first mention should create a singleton cluster.""" mention = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", "country_code": "US"}, ) result = service.resolve(mention) @@ -79,7 +79,7 @@ def test_strong_match_joins_cluster(service): # Resolve m1 first m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) result1 = service.resolve(m1) assert result1.top.cluster_id.value == "m1" @@ -87,7 +87,7 @@ def test_strong_match_joins_cluster(service): # Now resolve m2 with strong match to m1 m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", "country_code": "US"}, ) # Set up the linker to return a strong match (m1, m2, 0.95) @@ -116,14 +116,14 @@ def test_below_threshold_becomes_singleton(service): # Resolve m1 first m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) service.resolve(m1) # Resolve m2 with weak match to m1 m2 = 
Mention( id=MentionId(value="m2"), - attributes={"legal_name": "ACME Inc", "country_code": "US"} + attributes={"legal_name": "ACME Inc", "country_code": "US"}, ) # Set up weak match (0.7 < threshold 0.8) @@ -136,7 +136,9 @@ def test_below_threshold_becomes_singleton(service): # m2 should be assigned to its own cluster (cluster "m2"), # but genCand still includes m1's cluster (via the below-threshold link) - assert result2.top.cluster_id.value == "m1" # Still top by score, but own cluster also present + assert ( + result2.top.cluster_id.value == "m1" + ) # Still top by score, but own cluster also present assert result2.top.score == pytest.approx(0.7, abs=0.01) # Verify the new invariant: own cluster is always included @@ -165,11 +167,11 @@ def test_gen_cand_includes_below_threshold_links(service): # Resolve m1 and m3 in cluster 1, m3 in cluster 3 m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m3 = Mention( id=MentionId(value="m3"), - attributes={"legal_name": "Globex", "country_code": "US"} + attributes={"legal_name": "Globex", "country_code": "US"}, ) service.resolve(m1) service.resolve(m3) # m3 forms its own cluster @@ -179,7 +181,7 @@ def test_gen_cand_includes_below_threshold_links(service): # - weak link (0.7) to m3 (cluster "m3") -> below threshold m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", "country_code": "US"}, ) service._linker = FixedSimilarityLinker( @@ -210,11 +212,11 @@ def test_gen_cand_groups_by_cluster(service): # Cluster 1: m1, m2 m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", 
"country_code": "US"}, ) service.resolve(m1) service._linker = FixedSimilarityLinker({frozenset(["m1", "m2"]): 0.95}) @@ -224,7 +226,7 @@ def test_gen_cand_groups_by_cluster(service): # m3 has weak links to both m1 (0.75) and m2 (0.85) in the same cluster m3 = Mention( id=MentionId(value="m3"), - attributes={"legal_name": "Acme Industries", "country_code": "US"} + attributes={"legal_name": "Acme Industries", "country_code": "US"}, ) service._linker = FixedSimilarityLinker( @@ -258,7 +260,7 @@ def test_train_can_be_called_anytime(service): attributes={ "legal_name": "Company 1", "country_code": "US", - } + }, ) service.resolve(mention) @@ -313,7 +315,7 @@ def counting_train(): attributes={ "legal_name": f"Company {i}", "country_code": "US", - } + }, ) service.resolve(mention) service._linker.register_mention(mention) @@ -326,11 +328,11 @@ def test_state_reflects_mentions(service): """State should reflect all resolved mentions.""" m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme Corp", "country_code": "US"}, ) service.resolve(m1) @@ -348,7 +350,7 @@ def test_state_reflects_clusters(service): """State should reflect cluster membership.""" m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) service.resolve(m1) @@ -362,11 +364,11 @@ def test_state_reflects_similarities(service): """State should reflect all stored similarities.""" m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Acme Corp", "country_code": "US"} + attributes={"legal_name": "Acme 
Corp", "country_code": "US"}, ) service.resolve(m1) @@ -392,7 +394,7 @@ def test_resolution_result_never_empty(service): """Every resolve() call should return non-empty ResolutionResult.""" m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) result = service.resolve(m1) @@ -437,7 +439,7 @@ def test_resolution_result_always_top_n_pruned(service): for i in range(2, 7): mention = Mention( id=MentionId(value=f"m{i}"), - attributes={"legal_name": f"Company {i}", "country_code": "US"} + attributes={"legal_name": f"Company {i}", "country_code": "US"}, ) service.resolve(mention) @@ -447,7 +449,7 @@ def test_resolution_result_always_top_n_pruned(service): m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Company 1", "country_code": "US"} + attributes={"legal_name": "Company 1", "country_code": "US"}, ) result = service.resolve(m1) @@ -459,15 +461,15 @@ def test_multiple_independent_clusters(service): """Mentions with no links should form independent clusters.""" m1 = Mention( id=MentionId(value="m1"), - attributes={"legal_name": "Acme", "country_code": "US"} + attributes={"legal_name": "Acme", "country_code": "US"}, ) m2 = Mention( id=MentionId(value="m2"), - attributes={"legal_name": "Globex", "country_code": "US"} + attributes={"legal_name": "Globex", "country_code": "US"}, ) m3 = Mention( id=MentionId(value="m3"), - attributes={"legal_name": "Initech", "country_code": "US"} + attributes={"legal_name": "Initech", "country_code": "US"}, ) # No links between any of them From e2f5af37cd68406aeffdd9aaeb5b442435f94cb2 Mon Sep 17 00:00:00 2001 From: Twicechild Date: Thu, 26 Mar 2026 14:52:43 +0200 Subject: [PATCH 04/14] refactor(docker): multi-stage build, .dockerignore, non-root user --- .dockerignore | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ .gitattributes | 4 ++++ infra/Dockerfile | 59 ++++++++++++++++++++++++++++++++---------------- 3 
files changed, 102 insertions(+), 20 deletions(-) create mode 100644 .dockerignore create mode 100644 .gitattributes diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..33f05e6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,59 @@ +# Version control +.git +.gitignore +.gitattributes + +# Python +__pycache__ +*.pyc +*.pyo +.venv +.mypy_cache +.pytest_cache +.ruff_cache +*.egg-info + +# IDE +.vscode +.idea + +# Environment +.env +.env.* +!.env.example + +# Docker (no need to send these into the build context) +infra/Dockerfile +infra/compose.dev.yaml +infra/README.md +infra/.env* + +# AI / tooling config +.claude +CLAUDE.md + +# CI +.github + +# Docs +docs + +# Build artifacts and data +dist +build +data +reports +coverage.xml +htmlcov +.coverage +.tox + +# Tests and demo (not needed at runtime) +test +demo + +# Project config (not needed at runtime) +sonar-project.properties +.pylintrc +.importlinter +tox.ini diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bfec021 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,4 @@ +# Enforce Unix line endings +*.sh text eol=lf +Dockerfile text eol=lf +*.yaml text eol=lf diff --git a/infra/Dockerfile b/infra/Dockerfile index 5eb0407..169e41b 100644 --- a/infra/Dockerfile +++ b/infra/Dockerfile @@ -1,38 +1,57 @@ -# ── ERE application image ────────────────────────────────────────────────── -# Builds the Entity Resolution Engine service for local development. -# Requires only Docker — no local Python, Redis, or DuckDB installation. -# +# Multi-stage build for the Entity Resolution Engine. 
# Build context: repository root (one level above /infra) -# Usage: docker compose -f infra/docker-compose.yml up --build -# ─────────────────────────────────────────────────────────────────────────── -FROM python:3.12-slim +# ============================================================================= +# Builder stage: install dependencies +# ============================================================================= +FROM python:3.12-slim AS builder + +ARG POETRY_VERSION=">=2.0.0,<3.0.0" + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + POETRY_VIRTUALENVS_IN_PROJECT=true \ + POETRY_NO_INTERACTION=1 # git is required to fetch the ers-spec dependency from GitHub RUN apt-get update \ && apt-get install -y --no-install-recommends git \ && rm -rf /var/lib/apt/lists/* -# Install Poetry (locked to major version 2) -RUN pip install --no-cache-dir "poetry>=2.0.0,<3.0.0" +RUN pip install --no-cache-dir "poetry${POETRY_VERSION}" WORKDIR /app -# ── Dependency layer (cached unless pyproject.toml / poetry.lock change) ─── -COPY pyproject.toml poetry.lock* ./ +COPY pyproject.toml poetry.lock ./ -# Install into system Python (no virtualenv needed inside the container) -RUN poetry config virtualenvs.create false \ - && poetry install --without dev --no-root --no-interaction +RUN poetry install --without dev --no-root -# ── Application source ────────────────────────────────────────────────────── COPY README.md ./ COPY src/ ./src/ -COPY infra/config/ ./config/ -# Install the ere package itself -RUN poetry install --without dev --no-interaction +RUN poetry install --without dev + +# ============================================================================= +# Runtime stage: minimal image +# ============================================================================= +FROM python:3.12-slim AS runtime + +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PATH="/app/.venv/bin:${PATH}" + +RUN groupadd --gid 1000 appuser && \ + useradd --uid 1000 --gid appuser 
--shell /bin/bash --create-home appuser + +WORKDIR /app + +COPY --from=builder /app/.venv .venv +COPY --from=builder /app/src src +COPY config/ ./config/ + +# Volume mount point for DuckDB persistent storage +RUN mkdir -p /data && chown appuser:appuser /data + +USER appuser -# ── Runtime ───────────────────────────────────────────────────────────────── -# Fail fast: Python will exit immediately if the module cannot be imported. CMD ["python", "-m", "ere.entrypoints.app"] From 1cf319c7ae95ee4781129063dd65ff8905ba17d9 Mon Sep 17 00:00:00 2001 From: Twicechild Date: Thu, 26 Mar 2026 14:53:01 +0200 Subject: [PATCH 05/14] refactor(infra): modernize compose, env, Makefile and config layout --- .gitignore | 1 + Makefile | 51 +++++++++--- {infra/config => config}/README.md | 0 {infra/config => config}/rdf_mapping.yaml | 40 +++++----- {infra/config => config}/resolver.yaml | 0 .../config => config}/resolver_compound.yaml | 50 ++++++------ .../config => config}/resolver_multirule.yaml | 56 ++++++------- infra/.env.example | 18 +++++ infra/.env.local | 28 ------- infra/README.md | 71 +++++++++++++++++ infra/compose.dev.yaml | 78 +++++++++++++++++++ infra/docker-compose.yml | 66 ---------------- src/ere/adapters/rdf_mapper_impl.py | 14 +++- src/ere/services/factories.py | 9 ++- 14 files changed, 299 insertions(+), 183 deletions(-) rename {infra/config => config}/README.md (100%) rename {infra/config => config}/rdf_mapping.yaml (97%) rename {infra/config => config}/resolver.yaml (100%) rename {infra/config => config}/resolver_compound.yaml (96%) rename {infra/config => config}/resolver_multirule.yaml (96%) create mode 100644 infra/.env.example delete mode 100644 infra/.env.local create mode 100644 infra/README.md create mode 100644 infra/compose.dev.yaml delete mode 100644 infra/docker-compose.yml diff --git a/.gitignore b/.gitignore index a75bb70..65f3ffe 100644 --- a/.gitignore +++ b/.gitignore @@ -136,6 +136,7 @@ celerybeat.pid # Environments .env +infra/.env .envrc .venv 
env/ diff --git a/Makefile b/Makefile index 6a6cfa1..933f068 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,8 @@ SRC_PATH = ${PROJECT_PATH}/src TEST_PATH = ${PROJECT_PATH}/test BUILD_PATH = ${PROJECT_PATH}/dist INFRA_PATH = ${PROJECT_PATH}/infra +COMPOSE_FILE = ${INFRA_PATH}/compose.dev.yaml +ENV_FILE = ${INFRA_PATH}/.env PACKAGE_NAME = ere ICON_DONE = [✔] @@ -66,9 +68,13 @@ help: ## Display available targets @ echo "" @ echo -e " $(BUILD_PRINT)Infrastructure (Docker):$(END_BUILD_PRINT)" @ echo " infra-build - Build the ERE Docker image" - @ echo " infra-up - Start full stack (Redis + ERE) in detached mode" + @ echo " infra-up - Start services (docker compose up -d)" @ echo " infra-down - Stop and remove stack containers and networks" - @ echo " infra-logs - Tail ERE container logs" + @ echo " infra-down-volumes - Stop services and remove volumes (clean slate)" + @ echo " infra-rebuild - Rebuild images and start services" + @ echo " infra-rebuild-clean - Rebuild from scratch (no cache) and start" + @ echo " infra-logs - Follow service logs" + @ echo " infra-watch - Start services with file watching (sync src/ and config/)" @ echo "" @ echo -e " $(BUILD_PRINT)Utilities:$(END_BUILD_PRINT)" @ echo " clean - Remove build artifacts and caches" @@ -158,25 +164,48 @@ ci: ## Full CI pipeline for GitHub Actions (tox) #----------------------------------------------------------------------------- # Infrastructure commands (Docker) #----------------------------------------------------------------------------- -.PHONY: infra-build infra-up infra-down infra-logs +.PHONY: check-env infra-build infra-up infra-down infra-down-volumes infra-rebuild infra-rebuild-clean infra-logs infra-watch -infra-build: ## Build the ERE Docker image +check-env: + @ test -f $(ENV_FILE) || (echo -e "$(BUILD_PRINT)$(ICON_ERROR) Missing $(ENV_FILE). 
Run: cp infra/.env.example infra/.env$(END_BUILD_PRINT)" && exit 1) + +infra-build: check-env ## Build the ERE Docker image @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Building ERE Docker image$(END_BUILD_PRINT)" - @ docker compose -f $(INFRA_PATH)/docker-compose.yml build + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) build @ echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE image built$(END_BUILD_PRINT)" -infra-up: ## Start full stack: Redis + ERE (docker compose up --build) +infra-up: check-env ## Start services (docker compose up -d) @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Starting ERE stack$(END_BUILD_PRINT)" - @ docker compose -f $(INFRA_PATH)/docker-compose.yml up --build -d + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up -d @ echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE stack is running — use 'make infra-logs' to follow output$(END_BUILD_PRINT)" -infra-down: ## Stop and remove ERE stack containers and networks +infra-down: check-env ## Stop and remove ERE stack containers and networks @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Stopping ERE stack$(END_BUILD_PRINT)" - @ docker compose -f $(INFRA_PATH)/docker-compose.yml down + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) down @ echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE stack stopped$(END_BUILD_PRINT)" -infra-logs: ## Tail logs from the ERE container - @ docker compose -f $(INFRA_PATH)/docker-compose.yml logs -f ere +infra-down-volumes: check-env ## Stop services and remove volumes (clean slate) + @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Stopping ERE stack and removing volumes$(END_BUILD_PRINT)" + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) down -v + @ echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE stack stopped and volumes removed$(END_BUILD_PRINT)" + +infra-rebuild: check-env ## Rebuild images and start services + @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Rebuilding ERE stack$(END_BUILD_PRINT)" + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up -d --build + @ 
echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE stack rebuilt and started$(END_BUILD_PRINT)" + +infra-rebuild-clean: check-env ## Rebuild from scratch (no cache) and start + @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Rebuilding ERE stack (no cache)$(END_BUILD_PRINT)" + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) build --no-cache + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) up -d + @ echo -e "$(BUILD_PRINT)$(ICON_DONE) ERE stack rebuilt (clean) and started$(END_BUILD_PRINT)" + +infra-logs: check-env ## Follow service logs + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) logs -f + +infra-watch: check-env ## Start services with file watching (sync src/ and config/) + @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Starting ERE stack with watch$(END_BUILD_PRINT)" + @ docker compose -f $(COMPOSE_FILE) --env-file $(ENV_FILE) watch #----------------------------------------------------------------------------- # Utility commands diff --git a/infra/config/README.md b/config/README.md similarity index 100% rename from infra/config/README.md rename to config/README.md diff --git a/infra/config/rdf_mapping.yaml b/config/rdf_mapping.yaml similarity index 97% rename from infra/config/rdf_mapping.yaml rename to config/rdf_mapping.yaml index 8dd02e2..4b856ed 100644 --- a/infra/config/rdf_mapping.yaml +++ b/config/rdf_mapping.yaml @@ -1,20 +1,20 @@ -# Namespace prefix registry - used by rdf_mapper.py to resolve prefixed names in field paths -namespaces: - epo: "http://data.europa.eu/a4g/ontology#" - org: "http://www.w3.org/ns/org#" - locn: "http://www.w3.org/ns/locn#" - cccev: "http://data.europa.eu/m8g/" - -# Entity type mappings: entity_type_string -> rdf_type + field property paths -# Property paths use / as separator for multi-hop traversal. -# Field names must match entity_fields in resolver.yaml (legal_name, country_code). 
-entity_types: - ORGANISATION: - rdf_type: "org:Organization" - fields: - legal_name: "epo:hasLegalName" - country_code: "cccev:registeredAddress/epo:hasCountryCode" - nuts_code: "cccev:registeredAddress/epo:hasNutsCode" - post_code: "cccev:registeredAddress/locn:postCode" - post_name: "cccev:registeredAddress/locn:postName" - thoroughfare: "cccev:registeredAddress/locn:thoroughfare" +# Namespace prefix registry - used by rdf_mapper.py to resolve prefixed names in field paths +namespaces: + epo: "http://data.europa.eu/a4g/ontology#" + org: "http://www.w3.org/ns/org#" + locn: "http://www.w3.org/ns/locn#" + cccev: "http://data.europa.eu/m8g/" + +# Entity type mappings: entity_type_string -> rdf_type + field property paths +# Property paths use / as separator for multi-hop traversal. +# Field names must match entity_fields in resolver.yaml (legal_name, country_code). +entity_types: + ORGANISATION: + rdf_type: "org:Organization" + fields: + legal_name: "epo:hasLegalName" + country_code: "cccev:registeredAddress/epo:hasCountryCode" + nuts_code: "cccev:registeredAddress/epo:hasNutsCode" + post_code: "cccev:registeredAddress/locn:postCode" + post_name: "cccev:registeredAddress/locn:postName" + thoroughfare: "cccev:registeredAddress/locn:thoroughfare" diff --git a/infra/config/resolver.yaml b/config/resolver.yaml similarity index 100% rename from infra/config/resolver.yaml rename to config/resolver.yaml diff --git a/infra/config/resolver_compound.yaml b/config/resolver_compound.yaml similarity index 96% rename from infra/config/resolver_compound.yaml rename to config/resolver_compound.yaml index 9cac682..47ff9d9 100644 --- a/infra/config/resolver_compound.yaml +++ b/config/resolver_compound.yaml @@ -1,25 +1,25 @@ -# Entity Resolver configuration — Compound blocking (country_code AND city) -# Blocks pairs unless both country_code AND city match. -# Creates tight, city-level blocks within countries. -# Trade-off: fewer comparisons (faster) but may miss cross-city variants. 
- -cache_strategy: tf_incremental - -threshold: 0.5 - -top_n: 100 - -match_weight_threshold: -10 - -splink: - probability_two_random_records_match: 0.3 - - comparisons: - - type: jaro_winkler - field: legal_name - thresholds: [0.9, 0.8] - - # Compound blocking rule: a pair is compared only if both country_code AND city match. - # This is expressed as a list with two fields. - blocking_rules: - - [country_code, city] +# Entity Resolver configuration — Compound blocking (country_code AND city) +# Blocks pairs unless both country_code AND city match. +# Creates tight, city-level blocks within countries. +# Trade-off: fewer comparisons (faster) but may miss cross-city variants. + +cache_strategy: tf_incremental + +threshold: 0.5 + +top_n: 100 + +match_weight_threshold: -10 + +splink: + probability_two_random_records_match: 0.3 + + comparisons: + - type: jaro_winkler + field: legal_name + thresholds: [0.9, 0.8] + + # Compound blocking rule: a pair is compared only if both country_code AND city match. + # This is expressed as a list with two fields. + blocking_rules: + - [country_code, city] diff --git a/infra/config/resolver_multirule.yaml b/config/resolver_multirule.yaml similarity index 96% rename from infra/config/resolver_multirule.yaml rename to config/resolver_multirule.yaml index 6e76a8c..c8395c9 100644 --- a/infra/config/resolver_multirule.yaml +++ b/config/resolver_multirule.yaml @@ -1,28 +1,28 @@ -# Entity Resolver configuration — Multi-rule blocking (country OR city OR name) -# Three independent blocking rules evaluated as OR (union). -# A pair is included if any rule fires: same country, OR same city, OR exact name match. -# Trade-off: more comparisons (slower) but higher recall for diverse datasets. 
- -cache_strategy: tf_incremental - -threshold: 0.5 - -top_n: 100 - -match_weight_threshold: -10 - -splink: - probability_two_random_records_match: 0.3 - - comparisons: - - type: jaro_winkler - field: legal_name - thresholds: [0.9, 0.8] - - # Multi-rule blocking: three independent rules, evaluated as UNION ALL. - # A pair is included if any rule fires (country_code match, OR city match, OR exact legal_name match). - # Splink deduplicates the results internally. - blocking_rules: - - country_code - - city - - legal_name +# Entity Resolver configuration — Multi-rule blocking (country OR city OR name) +# Three independent blocking rules evaluated as OR (union). +# A pair is included if any rule fires: same country, OR same city, OR exact name match. +# Trade-off: more comparisons (slower) but higher recall for diverse datasets. + +cache_strategy: tf_incremental + +threshold: 0.5 + +top_n: 100 + +match_weight_threshold: -10 + +splink: + probability_two_random_records_match: 0.3 + + comparisons: + - type: jaro_winkler + field: legal_name + thresholds: [0.9, 0.8] + + # Multi-rule blocking: three independent rules, evaluated as UNION ALL. + # A pair is included if any rule fires (country_code match, OR city match, OR exact legal_name match). + # Splink deduplicates the results internally. 
+ blocking_rules: + - country_code + - city + - legal_name diff --git a/infra/.env.example b/infra/.env.example new file mode 100644 index 0000000..0057f84 --- /dev/null +++ b/infra/.env.example @@ -0,0 +1,18 @@ +# Copy this file to .env and customize as needed: +# cp infra/.env.example infra/.env + +# Redis +REDIS_HOST=redis +REDIS_PORT=6379 +REDIS_DB=0 +REDIS_PASSWORD=changeme + +# Queue names +REQUEST_QUEUE=ere_requests +RESPONSE_QUEUE=ere_responses + +# DuckDB (path inside container, volume-mounted) +DUCKDB_PATH=/data/app.duckdb + +# Logging +LOG_LEVEL=INFO diff --git a/infra/.env.local b/infra/.env.local deleted file mode 100644 index e89187b..0000000 --- a/infra/.env.local +++ /dev/null @@ -1,28 +0,0 @@ -# Copy this file to .env.local and customize as needed -# This file is a template for Docker Compose configuration - -# ── Redis Configuration ────────────────────────────────────────────────────── -# Inside Docker Compose, use 'redis' as hostname. For local testing, use 'localhost' -REDIS_HOST=redis -REDIS_PORT=6379 -REDIS_DB=0 - -# Redis authentication (recommended for security) -REDIS_PASSWORD=changeme - -# ── Redis Queue Names ──────────────────────────────────────────────────────── -# Queue names for entity resolution requests and responses -REQUEST_QUEUE=ere_requests -RESPONSE_QUEUE=ere_responses - -# ── DuckDB Persistent Storage ──────────────────────────────────────────────── -# Path to DuckDB file inside container (volume-mounted from ere-data volume) -DUCKDB_PATH=/data/app.duckdb - -# ── ERE Service Port ───────────────────────────────────────────────────────── -# Port exposed to host machine for the ERE service -APP_PORT=8000 - -# ── Logging ────────────────────────────────────────────────────────────────── -# Python logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) -LOG_LEVEL=INFO diff --git a/infra/README.md b/infra/README.md new file mode 100644 index 0000000..db68b06 --- /dev/null +++ b/infra/README.md @@ -0,0 +1,71 @@ +# Infrastructure + 
+Deployment and infrastructure files for the Entity Resolution Engine. + +## Structure + +``` +infra/ +├── .env.example # Environment variable template +├── compose.dev.yaml # Docker Compose for local development +├── Dockerfile # Multi-stage build (builder + runtime) +└── README.md +``` + +## Services + +| Service | Purpose | Port | +|---|---|---| +| `ere` | Entity Resolution Engine (Redis queue worker) | — (no HTTP API) | +| `redis` | Message queue for ERE requests/responses | 6379 | +| `redisinsight` | Redis GUI (development tool) | 5540 | + +## Usage + +All commands run from the repo root via `make`: + +```bash +make infra-build # Build the ERE Docker image +make infra-up # Start services (docker compose up -d) +make infra-down # Stop and remove containers and networks +make infra-down-volumes # Stop services and remove volumes (clean slate) +make infra-rebuild # Rebuild images and start services +make infra-rebuild-clean # Rebuild from scratch (no cache) +make infra-logs # Follow service logs +make infra-watch # Start services with file watching (sync src/ and config/) +``` + +### File watching (development) + +`make infra-watch` uses Docker Compose's `watch` feature to sync source code and +configuration changes into the running container without a full rebuild: + +- **Source changes** (`src/`) are synced live into the container +- **Config changes** (`config/`) are synced live into the container +- **Dependency changes** (`pyproject.toml`, `poetry.lock`) trigger a full rebuild + +> **Note:** ERE is a long-running queue worker, not an HTTP server with hot-reload. +> After syncing, restart the container to pick up changes: `docker compose -f infra/compose.dev.yaml restart ere` + +### Manual build + +```bash +docker build -f infra/Dockerfile -t ere:latest . +``` + +## Configuration + +Environment variables are loaded from `infra/.env`. See `infra/.env.example` for available options. 
To set up: + +```bash +cp infra/.env.example infra/.env +``` + +### Resolver configuration + +Entity resolution behaviour is configured via YAML files in the top-level `config/` directory: + +- **[resolver.yaml](../config/resolver.yaml)** — Splink comparisons, cold-start parameters, blocking rules, thresholds +- **[rdf_mapping.yaml](../config/rdf_mapping.yaml)** — RDF namespace bindings, field extraction rules, entity type definitions + +See the [configuration README](../config/README.md) for detailed tuning guidance. diff --git a/infra/compose.dev.yaml b/infra/compose.dev.yaml new file mode 100644 index 0000000..1fddb3a --- /dev/null +++ b/infra/compose.dev.yaml @@ -0,0 +1,78 @@ +# Docker Compose configuration for local development + +name: ere-local + +services: + redis: + image: redis:7-alpine + container_name: "redis" + restart: unless-stopped + command: redis-server --requirepass ${REDIS_PASSWORD:-changeme} + ports: + - "6379:6379" + healthcheck: + test: ["CMD", "sh", "-c", "redis-cli --no-auth-warning -a $REDIS_PASSWORD ping"] + interval: 5s + timeout: 3s + retries: 5 + environment: + - REDIS_PASSWORD=${REDIS_PASSWORD:-changeme} + networks: + - ere-net + + redisinsight: + image: redis/redisinsight:3.2.0 + container_name: "redisinsight" + restart: unless-stopped + ports: + - "5540:5540" + healthcheck: + test: ["CMD", "wget", "--spider", "-q", "http://127.0.0.1:5540/api/health"] + interval: 5s + timeout: 3s + retries: 5 + networks: + - ere-net + + ere: + build: + context: .. 
+ dockerfile: infra/Dockerfile + container_name: "ere" + env_file: .env + restart: unless-stopped + environment: + - DUCKDB_PATH=${DUCKDB_PATH:-/data/app.duckdb} + - RDF_MAPPING_PATH=/app/config/rdf_mapping.yaml + - RESOLVER_CONFIG_PATH=/app/config/resolver.yaml + # Remaining REDIS_* and queue vars inherited from env_file + healthcheck: + test: ["CMD", "sh", "-c", "test -f /proc/1/cmdline"] + interval: 10s + timeout: 3s + retries: 3 + depends_on: + redis: + condition: service_healthy + volumes: + - ere-data:/data + develop: + watch: + - action: sync + path: ../src + target: /app/src + - action: sync + path: ../config + target: /app/config + - action: rebuild + path: ../pyproject.toml + - action: rebuild + path: ../poetry.lock + networks: + - ere-net + +volumes: + ere-data: + +networks: + ere-net: diff --git a/infra/docker-compose.yml b/infra/docker-compose.yml deleted file mode 100644 index ef5b8df..0000000 --- a/infra/docker-compose.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: ere-local - -services: - - # ── Redis ────────────────────────────────────────────────────────────────── - redis: - image: redis:7-alpine - restart: unless-stopped - command: redis-server --requirepass ${REDIS_PASSWORD:-changeme} - ports: - - "6379:6379" - networks: - - ere-net - healthcheck: - test: ["CMD", "sh", "-c", "redis-cli --no-auth-warning -a $REDIS_PASSWORD ping"] - interval: 5s - timeout: 3s - retries: 5 - environment: - - REDIS_PASSWORD=${REDIS_PASSWORD:-changeme} - - - # ── Redis Insight (GUI for Redis) ────────────────────────────────────────── - redisinsight: - image: redis/redisinsight:latest - restart: unless-stopped - ports: - - "5540:5540" - networks: - - ere-net - environment: - # Optional: set analytics to false if you prefer no telemetry - - REDISINSIGHT_ANALYTICS=true - - - # ── Entity Resolution Engine ─────────────────────────────────────────────── - ere: - build: - context: .. 
- dockerfile: infra/Dockerfile - env_file: .env.local - restart: unless-stopped - ports: - - "${APP_PORT:-8000}:8000" - environment: - # DuckDB embedded file location (volume-mounted at /data) - - DUCKDB_PATH=${DUCKDB_PATH:-/data/app.duckdb} - # Config file paths in the container - - RDF_MAPPING_PATH=/app/config/rdf_mapping.yaml - - RESOLVER_CONFIG_PATH=/app/config/resolver.yaml - # Inherit REQUEST_QUEUE, RESPONSE_QUEUE, REDIS_* from .env.local - depends_on: - redis: - condition: service_healthy - volumes: - - ere-data:/data # DuckDB embedded file and other persistent state - networks: - - ere-net - -# ── Shared state ─────────────────────────────────────────────────────────── -volumes: - ere-data: - -# ── Internal network (not exposed to host) ───────────────────────────────── -networks: - ere-net: diff --git a/src/ere/adapters/rdf_mapper_impl.py b/src/ere/adapters/rdf_mapper_impl.py index 243f6b1..62554fc 100644 --- a/src/ere/adapters/rdf_mapper_impl.py +++ b/src/ere/adapters/rdf_mapper_impl.py @@ -42,7 +42,11 @@ def _load_mappings(rdf_mapping_path: str | Path = None) -> dict: dict: Entity type mappings from config. 
""" if rdf_mapping_path is None: - rdf_mapping_path = Path(__file__).parent.parent.parent.parent / "infra" / "config" / "rdf_mapping.yaml" + rdf_mapping_path = ( + Path(__file__).parent.parent.parent.parent + / "config" + / "rdf_mapping.yaml" + ) else: rdf_mapping_path = Path(rdf_mapping_path) return load_entity_mappings(rdf_mapping_path) @@ -70,9 +74,13 @@ def map_entity_mention_to_domain(self, entity_mention: EntityMention) -> Mention ) mention_id = MentionId( - value=self._derive_mention_id(eid.source_id, eid.request_id, eid.entity_type) + value=self._derive_mention_id( + eid.source_id, eid.request_id, eid.entity_type + ) + ) + attributes = extract_mention_attributes( + entity_mention.content, entity_type_config ) - attributes = extract_mention_attributes(entity_mention.content, entity_type_config) return Mention(id=mention_id, attributes=attributes) @staticmethod diff --git a/src/ere/services/factories.py b/src/ere/services/factories.py index 6442ae8..debd261 100644 --- a/src/ere/services/factories.py +++ b/src/ere/services/factories.py @@ -19,7 +19,10 @@ from ere.adapters.duckdb_schema import init_schema from ere.adapters.rdf_mapper_port import RDFMapper from ere.adapters.splink_linker_impl import SpLinkSimilarityLinker -from ere.services.entity_resolution_service import EntityResolver, EntityResolutionService +from ere.services.entity_resolution_service import ( + EntityResolver, + EntityResolutionService, +) from ere.services.resolver_config import ResolverConfig @@ -47,7 +50,9 @@ def build_entity_resolver( Fully-constructed EntityResolver with DuckDB backend and Splink linker. 
""" if resolver_config_path is None: - config_path = Path(__file__).parent.parent.parent.parent / "infra" / "config" / "resolver.yaml" + config_path = ( + Path(__file__).parent.parent.parent.parent / "config" / "resolver.yaml" + ) else: config_path = Path(resolver_config_path) From ee8e8cb380f461a31066fcd80357e1499f8e229d Mon Sep 17 00:00:00 2001 From: Twicechild Date: Thu, 26 Mar 2026 14:53:09 +0200 Subject: [PATCH 06/14] fix(docs): update references for new infra layout --- .github/workflows/code-quality.yaml | 10 ++-- CHANGELOG.md | 2 +- README.md | 35 +++++++++----- demo/README.md | 13 +++--- demo/demo.py | 72 +++++++++++++++++++++-------- docs/algorithm.md | 2 +- test/stress/README.md | 2 +- test/stress/stress_test.py | 9 ++-- 8 files changed, 94 insertions(+), 51 deletions(-) diff --git a/.github/workflows/code-quality.yaml b/.github/workflows/code-quality.yaml index 4369e8e..5bc9317 100644 --- a/.github/workflows/code-quality.yaml +++ b/.github/workflows/code-quality.yaml @@ -7,8 +7,8 @@ # 2. Lint, Test & Verify (tox: unit tests + architecture + clean-code checks) # 3. 
SonarCloud analysis (coverage, quality gate) # -# Optional repository secrets: -# - SONAR_TOKEN: SonarCloud authentication token (step skipped when absent) +# Required repository secrets: +# - SONAR_TOKEN: SonarCloud authentication token # # If the private ers-spec dependency fails to resolve with the default # GITHUB_TOKEN, add a PAT as GH_TOKEN_PRIVATE_REPOS and uncomment the @@ -29,8 +29,6 @@ jobs: quality: name: Lint, Test & Verify runs-on: ubuntu-latest - env: - SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} services: redis: @@ -96,14 +94,14 @@ jobs: # ------------------------------------------------------------------ - name: Run quality checks (unit tests + architecture + clean-code) run: | - rm -f infra/.env.local + rm -f infra/.env poetry run tox -e py312,architecture,clean-code # ------------------------------------------------------------------ # SonarCloud # ------------------------------------------------------------------ - name: SonarCloud scan - if: always() && env.SONAR_TOKEN != '' + if: always() uses: SonarSource/sonarqube-scan-action@v6 env: SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index 9980d99..3a6296f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,7 +65,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 **Docker & Deployment** - Multi-stage Dockerfile for production-ready containerization -- `docker-compose.yml` for full-stack setup (Redis + ERE service) +- `compose.dev.yaml` for full-stack setup (Redis + ERE service) - `.env.example` template for configuration **Documentation** diff --git a/README.md b/README.md index 81e76ca..442f396 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Its primary purpose is to interact with the Entity Resolution System (ERSys). 
It For detailed documentation, see: - [Architecture](docs/architecture.md) - description of the applied architecture - [Algorithm](docs/algorithm.md) - incremental probabilistic entity linking -- [Configuration](infra/config/README.md) - field mapping, model tuning, Splink setup +- [Configuration](config/README.md) - field mapping, model tuning, Splink setup - [ERS–ERE Technical Contract v0.2](docs/ERS-ERE-System-Technical-Contract.pdf) @@ -66,7 +66,10 @@ make install ``` To build and launch Docker-based stack (ERE + Redis): -1. (optional) Adjust connection and logging config in [.env.local](infra/.env.local). +1. (optional) Copy and adjust connection and logging config: + ```bash + cp infra/.env.example infra/.env + ``` 2. Run the following: ```bash # Build the ERE Docker image @@ -93,7 +96,7 @@ Terminate the service: make infra-down ``` -Note: In order for the demo to work, you need to either set `REDIS_HOST=localhost` in the [.env.local](infra/.env.local) file or pass it to the script as an environment variable. +Note: In order for the demo to work, you need to either set `REDIS_HOST=localhost` in [infra/.env](infra/.env.example) or pass it to the script as an environment variable. For detailed setup instructions, see `Make targets`. 
@@ -133,9 +136,13 @@ Available targets (`make help`): Infrastructure (Docker): infra-build - Build the ERE Docker image - infra-up - Start full stack (Redis + ERE) in detached mode + infra-up - Start services (docker compose up -d) infra-down - Stop and remove stack containers and networks - infra-logs - Tail ERE container logs + infra-down-volumes - Stop services and remove volumes (clean slate) + infra-rebuild - Rebuild images and start services + infra-rebuild-clean - Rebuild from scratch (no cache) and start + infra-logs - Follow service logs + infra-watch - Start services with file watching (sync src/ and config/) Utilities: clean - Remove build artifacts and caches @@ -145,10 +152,10 @@ Available targets (`make help`): ### Configuration (Resolver and Mapper) Entity resolution behaviour is configured via two YAML files: -- **Resolver configuration** ([resolver.yaml](./infra/config/resolver.yaml)): Splink comparisons, cold-start parameters, similarity thresholds -- **RDF mapping** ([rdf_mapping.yaml](./infra/config/rdf_mapping.yaml)): RDF namespace bindings, field extraction rules, entity type definitions +- **Resolver configuration** ([resolver.yaml](./config/resolver.yaml)): Splink comparisons, cold-start parameters, similarity thresholds +- **RDF mapping** ([rdf_mapping.yaml](./config/rdf_mapping.yaml)): RDF namespace bindings, field extraction rules, entity type definitions -For detailed configuration options and tuning, see the [configuration page](./infra/config/README.md). +For detailed configuration options and tuning, see the [configuration page](./config/README.md). 
### Examples @@ -203,11 +210,15 @@ docs/ ├── ERS-ERE-System-Technical-Contract.pdf └── *.md # Topic documentation +config/ +├── resolver.yaml # Splink comparisons, blocking rules, thresholds +├── rdf_mapping.yaml # RDF namespace bindings, field extraction rules +└── README.md # Configuration documentation + infra/ ├── Dockerfile # ERE service image definition -├── docker-compose.yml # Full stack (Redis + ERE) -├── config # ERE Configuration -└── .env.local # Local runtime config (git-ignored) +├── compose.dev.yaml # Docker Compose for local development +└── .env.example # Environment variable template demo/ ├── demo.py # Entity resolution demonstration script @@ -266,7 +277,7 @@ make test-integration # Code formatting and linting make format # Auto-format with Ruff -make lint-check # Lint without modifying files +make lint # Lint without modifying files make lint-fix # Lint with auto-fix ``` diff --git a/demo/README.md b/demo/README.md index aa45f79..4a1ce9f 100644 --- a/demo/README.md +++ b/demo/README.md @@ -18,7 +18,7 @@ The demo treats ERE as a black box service accessible only through Redis message ## Configuration -Configuration is loaded from `.env.local` (or environment variables): +Configuration is loaded from `infra/.env` (or environment variables): | Variable | Default | Purpose | |----------|---------|---------| @@ -44,14 +44,13 @@ The script tries the configured host first, then falls back to `localhost` if th Start the full stack including Redis and ERE: ```bash -cd /home/greg/PROJECTS/ERS/ere-basic -docker-compose -f infra/docker-compose.yml up -d +make infra-rebuild ``` Wait for services to be ready (check logs): ```bash -docker-compose -f infra/docker-compose.yml logs -f +make infra-logs ``` ### 2. Locally (development) @@ -205,7 +204,7 @@ If it returns `PONG`, Redis is running. 
If not: - **Docker**: `docker run -d -p 6379:6379 redis:latest` - **Local Redis**: `brew install redis && brew services start redis` (macOS) -- **Docker Compose**: Ensure the service is running: `docker-compose -f infra/docker-compose.yml up redis` +- **Docker Compose**: Ensure the service is running: `make infra-up` ### Timeout waiting for responses @@ -216,14 +215,14 @@ If it returns `PONG`, Redis is running. If not: **Check ERE logs:** ```bash -docker-compose -f infra/docker-compose.yml logs ere +make infra-logs ``` ### Password authentication fails **Edit Redis connection parameters:** -Option 1: Modify `.env.local`: +Option 1: Modify `infra/.env`: ```bash REDIS_PASSWORD=your_password ``` diff --git a/demo/demo.py b/demo/demo.py index 711178d..f8d33bf 100755 --- a/demo/demo.py +++ b/demo/demo.py @@ -17,7 +17,7 @@ Before running a fresh demo with different data, clear the old database: docker volume rm ere-local_ere-data - docker-compose -f infra/docker-compose.yml up -d + make infra-rebuild Failure to do so will mix old mentions with new ones, corrupting demo results. 
""" @@ -35,7 +35,9 @@ # Default data file path DEFAULT_DATA_FILE = Path(__file__).parent / "data" / "org-tiny.json" -DELAY_BETWEEN_MESSAGES = 0 # seconds to wait between sending messages (set to >0 for sequential processing) +DELAY_BETWEEN_MESSAGES = ( + 0 # seconds to wait between sending messages (set to >0 for sequential processing) +) GLOBAL_TIMEOUT = 0 # seconds to wait for responses before giving up (0 = no timeout) @@ -43,13 +45,14 @@ # Configuration # =============================================================================== + def load_env_file(env_path: str = None) -> dict: - """Load configuration from .env.local or environment variables.""" + """Load configuration from .env or environment variables.""" config = {} - # Try to load from .env.local if it exists + # Try to load from .env if it exists if env_path is None: - env_path = Path(__file__).parent.parent / "infra" / ".env.local" + env_path = Path(__file__).parent.parent / "infra" / ".env" if Path(env_path).exists(): with open(env_path) as f: @@ -60,13 +63,23 @@ def load_env_file(env_path: str = None) -> dict: key, value = line.split("=", 1) config[key.strip()] = value.strip() - # Environment variables override .env.local - config["REDIS_HOST"] = os.environ.get("REDIS_HOST", config.get("REDIS_HOST", "localhost")) - config["REDIS_PORT"] = int(os.environ.get("REDIS_PORT", config.get("REDIS_PORT", "6379"))) + # Environment variables override .env + config["REDIS_HOST"] = os.environ.get( + "REDIS_HOST", config.get("REDIS_HOST", "localhost") + ) + config["REDIS_PORT"] = int( + os.environ.get("REDIS_PORT", config.get("REDIS_PORT", "6379")) + ) config["REDIS_DB"] = int(os.environ.get("REDIS_DB", config.get("REDIS_DB", "0"))) - config["REDIS_PASSWORD"] = os.environ.get("REDIS_PASSWORD", config.get("REDIS_PASSWORD")) - config["REQUEST_QUEUE"] = os.environ.get("REQUEST_QUEUE", config.get("REQUEST_QUEUE", "ere_requests")) - config["RESPONSE_QUEUE"] = os.environ.get("RESPONSE_QUEUE", 
config.get("RESPONSE_QUEUE", "ere_responses")) + config["REDIS_PASSWORD"] = os.environ.get( + "REDIS_PASSWORD", config.get("REDIS_PASSWORD") + ) + config["REQUEST_QUEUE"] = os.environ.get( + "REQUEST_QUEUE", config.get("REQUEST_QUEUE", "ere_requests") + ) + config["RESPONSE_QUEUE"] = os.environ.get( + "RESPONSE_QUEUE", config.get("RESPONSE_QUEUE", "ere_responses") + ) return config @@ -77,6 +90,7 @@ def load_env_file(env_path: str = None) -> dict: TRACE = 5 + def setup_logging(): """Configure logging with timestamps.""" log_level_name = os.environ.get("LOG_LEVEL", "INFO").upper() @@ -105,7 +119,10 @@ def setup_logging(): # Redis Connection # =============================================================================== -def check_redis_connectivity(host: str, port: int, db: int, password: str) -> redis.Redis: + +def check_redis_connectivity( + host: str, port: int, db: int, password: str +) -> redis.Redis: """ Check Redis connectivity and return client. @@ -124,7 +141,9 @@ def check_redis_connectivity(host: str, port: int, db: int, password: str) -> re last_error = None for try_host in hosts_to_try: try: - logging.getLogger(__name__).info(f"Attempting Redis connection to {try_host}:{port}...") + logging.getLogger(__name__).info( + f"Attempting Redis connection to {try_host}:{port}..." + ) client = redis.Redis( host=try_host, port=port, @@ -147,6 +166,7 @@ def check_redis_connectivity(host: str, port: int, db: int, password: str) -> re # Request/Response Handling # =============================================================================== + def escape_turtle_string(value: str) -> str: """ Escape a string for safe inclusion in Turtle RDF format. @@ -223,7 +243,7 @@ def create_entity_mention_request( thoroughfare_safe = escape_turtle_string(thoroughfare) address_props.append(f'locn:thoroughfare "{thoroughfare_safe}"') - address_content = ' ;\n '.join(address_props) + address_content = " ;\n ".join(address_props) content = f"""@prefix org: . @prefix cccev: . 
@@ -263,6 +283,7 @@ def parse_response(response_bytes: bytes) -> dict: # Demo Data Loading # =============================================================================== + def load_demo_mentions(data_file: str | None = None) -> list[dict]: """ Load demo mentions from a JSON file. @@ -298,6 +319,7 @@ def load_demo_mentions(data_file: str | None = None) -> list[dict]: # Main Demo # =============================================================================== + def main(data_file: str | None = None): """ Run the Redis-based ERE demo. @@ -323,7 +345,9 @@ def main(data_file: str | None = None): # Load demo mentions from JSON try: demo_mentions = load_demo_mentions(data_file) - logger.info(f"Loaded {len(demo_mentions)} mentions from {data_file or DEFAULT_DATA_FILE}") + logger.info( + f"Loaded {len(demo_mentions)} mentions from {data_file or DEFAULT_DATA_FILE}" + ) except (FileNotFoundError, ValueError) as e: logger.error(f"Failed to load demo mentions: {e}") return 1 @@ -357,7 +381,7 @@ def main(data_file: str | None = None): f" \n" f" To reset the database:\n" f" 1. docker volume rm ere-local_ere-data\n" - f" 2. docker-compose -f infra/docker-compose.yml up -d\n" + f" 2. make infra-rebuild\n" ) # Send demo requests @@ -414,7 +438,9 @@ def main(data_file: str | None = None): while len(responses_received) < len(request_ids): elapsed = time.time() - start_time if GLOBAL_TIMEOUT > 0 and elapsed > GLOBAL_TIMEOUT: - logger.warning(f"Timeout after {GLOBAL_TIMEOUT}s. Received {len(responses_received)}/{len(request_ids)} responses.") + logger.warning( + f"Timeout after {GLOBAL_TIMEOUT}s. Received {len(responses_received)}/{len(request_ids)} responses." 
+ ) break # Try to get a response with short timeout @@ -425,7 +451,9 @@ def main(data_file: str | None = None): response = parse_response(response_bytes) if logger.isEnabledFor(TRACE): - logger.log(TRACE, f"Full response message:\n{json.dumps(response, indent=2)}") + logger.log( + TRACE, f"Full response message:\n{json.dumps(response, indent=2)}" + ) req_id = response["entity_mention_id"]["request_id"] responses_received[req_id] = response @@ -455,7 +483,9 @@ def main(data_file: str | None = None): ) logger.info("-" * 80) - logger.info(f"\nDemo complete. Received {len(responses_received)}/{len(request_ids)} responses.") + logger.info( + f"\nDemo complete. Received {len(responses_received)}/{len(request_ids)} responses." + ) # Build clustering summary as single block summary_lines = [] @@ -510,7 +540,9 @@ def main(data_file: str | None = None): logger.info("✓ All responses received successfully!") return 0 else: - logger.warning(f"✗ Missing {len(request_ids) - len(responses_received)} response(s).") + logger.warning( + f"✗ Missing {len(request_ids) - len(responses_received)} response(s)." + ) return 1 diff --git a/docs/algorithm.md b/docs/algorithm.md index 7ad5bb7..201dbbd 100644 --- a/docs/algorithm.md +++ b/docs/algorithm.md @@ -100,7 +100,7 @@ The algorithm processes mentions one at a time, making immediate clustering deci | **top_n** | Maximum candidate clusters returned per mention | | **blocking_rules** | Pre-filters to reduce similarity computation | -The complete list of configuration parameters together with comprehensive description is available in [Configuration](../infra/config/README.md). +The complete list of configuration parameters together with comprehensive description is available in [Configuration](../config/README.md). 
## Outputs diff --git a/test/stress/README.md b/test/stress/README.md index 0d69f7b..cd33711 100644 --- a/test/stress/README.md +++ b/test/stress/README.md @@ -45,7 +45,7 @@ poetry run python3 test/stress/stress_test.py \ ### Optional **`--config PATH`** -- Path to resolver config YAML (default: `infra/config/resolver.yaml`) +- Path to resolver config YAML (default: `config/resolver.yaml`) - Determines blocking rules, thresholds, and Splink settings **`--seed N`** diff --git a/test/stress/stress_test.py b/test/stress/stress_test.py index 4588c63..14cbb72 100644 --- a/test/stress/stress_test.py +++ b/test/stress/stress_test.py @@ -14,7 +14,7 @@ --dataset test/stress/data/org-mid.csv \ --seed 200 \ --records 500 \ - --config infra/config/resolver.yaml \ + --config config/resolver.yaml \ --output /tmp/stress_mid.json """ @@ -141,7 +141,10 @@ def create_resolver( def seed_and_train( - resolver: EntityResolver, mentions: list[Mention], n_seed: int, skip_train: bool = False + resolver: EntityResolver, + mentions: list[Mention], + n_seed: int, + skip_train: bool = False, ): """ Seed resolver with first n_seed mentions and optionally trigger training. @@ -409,7 +412,7 @@ def main(): ) parser.add_argument( "--config", - default="infra/config/resolver.yaml", + default="config/resolver.yaml", help="Path to resolver config YAML", ) parser.add_argument( From 9332da6246313727a9289a9087ed83912d4af23d Mon Sep 17 00:00:00 2001 From: Grzegorz Kostkowski Date: Fri, 27 Mar 2026 13:15:03 +0100 Subject: [PATCH 07/14] chore: update line endings in the demo script --- demo/demo.py | 1126 +++++++++++++++++++++++++------------------------- 1 file changed, 563 insertions(+), 563 deletions(-) diff --git a/demo/demo.py b/demo/demo.py index f8d33bf..1a06939 100755 --- a/demo/demo.py +++ b/demo/demo.py @@ -1,563 +1,563 @@ -#!/usr/bin/env python3 -""" -Demo: Indirect Redis client for ERE (Entity Resolution Engine). 
- -This demo connects to ERE through the Redis queue infrastructure (no direct Python API). -It demonstrates: -1. Checking Redis connectivity -2. Sending EntityMentionResolutionRequest messages to the queue -3. Listening for EntityMentionResolutionResponse messages -4. Logging all interactions - -The example uses 6 synthetic mentions from ALGORITHM.md that cluster into 2 groups: - - Cluster 1: {1, 2, 5} (organizations with high similarity) - - Cluster 2: {3, 4, 6} (different organizations, also highly similar) - -⚠️ IMPORTANT: The ERE resolver persists state in a DuckDB database volume. - Before running a fresh demo with different data, clear the old database: - - docker volume rm ere-local_ere-data - make infra-rebuild - - Failure to do so will mix old mentions with new ones, corrupting demo results. -""" - -import json -import logging -import os -import sys -import time -from datetime import datetime, timezone -from pathlib import Path - -import redis - -# Default data file path -DEFAULT_DATA_FILE = Path(__file__).parent / "data" / "org-tiny.json" - -DELAY_BETWEEN_MESSAGES = ( - 0 # seconds to wait between sending messages (set to >0 for sequential processing) -) -GLOBAL_TIMEOUT = 0 # seconds to wait for responses before giving up (0 = no timeout) - - -# =============================================================================== -# Configuration -# =============================================================================== - - -def load_env_file(env_path: str = None) -> dict: - """Load configuration from .env or environment variables.""" - config = {} - - # Try to load from .env if it exists - if env_path is None: - env_path = Path(__file__).parent.parent / "infra" / ".env" - - if Path(env_path).exists(): - with open(env_path) as f: - for line in f: - line = line.strip() - if line and not line.startswith("#"): - if "=" in line: - key, value = line.split("=", 1) - config[key.strip()] = value.strip() - - # Environment variables override .env - 
config["REDIS_HOST"] = os.environ.get( - "REDIS_HOST", config.get("REDIS_HOST", "localhost") - ) - config["REDIS_PORT"] = int( - os.environ.get("REDIS_PORT", config.get("REDIS_PORT", "6379")) - ) - config["REDIS_DB"] = int(os.environ.get("REDIS_DB", config.get("REDIS_DB", "0"))) - config["REDIS_PASSWORD"] = os.environ.get( - "REDIS_PASSWORD", config.get("REDIS_PASSWORD") - ) - config["REQUEST_QUEUE"] = os.environ.get( - "REQUEST_QUEUE", config.get("REQUEST_QUEUE", "ere_requests") - ) - config["RESPONSE_QUEUE"] = os.environ.get( - "RESPONSE_QUEUE", config.get("RESPONSE_QUEUE", "ere_responses") - ) - - return config - - -# =============================================================================== -# Logging Setup -# =============================================================================== - -TRACE = 5 - - -def setup_logging(): - """Configure logging with timestamps.""" - log_level_name = os.environ.get("LOG_LEVEL", "INFO").upper() - - # Handle custom TRACE level - if log_level_name == "TRACE": - log_level = TRACE - logging.addLevelName(TRACE, "TRACE") - else: - log_level = getattr(logging, log_level_name, logging.INFO) - - logging.basicConfig( - level=log_level, - format="%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - - logger = logging.getLogger(__name__) - logger.setLevel(log_level) - logger.info(f"Logging configured at level {log_level_name}") - - return logger - - -# =============================================================================== -# Redis Connection -# =============================================================================== - - -def check_redis_connectivity( - host: str, port: int, db: int, password: str -) -> redis.Redis: - """ - Check Redis connectivity and return client. - - Attempts connection to specified host first, then fallback to localhost - if configured host is "redis" (Docker). - - Raises: - RuntimeError: If Redis is not accessible. 
- """ - hosts_to_try = [host] - - # Fallback: if configured host is "redis" (Docker), also try localhost - if host == "redis": - hosts_to_try.append("localhost") - - last_error = None - for try_host in hosts_to_try: - try: - logging.getLogger(__name__).info( - f"Attempting Redis connection to {try_host}:{port}..." - ) - client = redis.Redis( - host=try_host, - port=port, - db=db, - password=password, - decode_responses=False, - ) - client.ping() - return client - except Exception as e: - last_error = e - continue - - raise RuntimeError( - f"Redis unavailable. Tried hosts: {hosts_to_try}, port: {port}, db: {db}" - ) from last_error - - -# =============================================================================== -# Request/Response Handling -# =============================================================================== - - -def escape_turtle_string(value: str) -> str: - """ - Escape a string for safe inclusion in Turtle RDF format. - - Handles special characters: backslash, double quotes, newlines, carriage returns, tabs. - - Args: - value: String to escape - - Returns: - Escaped string safe for use in Turtle string literals - """ - if not value: - return value - - # Escape backslash first (must be done before other escapes) - value = value.replace("\\", "\\\\") - # Escape double quotes - value = value.replace('"', '\\"') - # Escape newlines - value = value.replace("\n", "\\n") - # Escape carriage returns - value = value.replace("\r", "\\r") - # Escape tabs - value = value.replace("\t", "\\t") - - return value - - -def create_entity_mention_request( - request_id: str, - source_id: str, - entity_type: str, - legal_name: str, - country_code: str, - nuts_code: str | None = None, - post_code: str | None = None, - post_name: str | None = None, - thoroughfare: str | None = None, -) -> dict: - """ - Create an EntityMentionResolutionRequest payload. - - Uses RDF/Turtle format with entity metadata including extended address fields. 
- All string values are properly escaped for Turtle compatibility. - - Args: - request_id: Unique request identifier - source_id: Source system identifier - entity_type: Entity type (e.g., ORGANISATION) - legal_name: Legal name of the entity - country_code: ISO 2-letter country code - nuts_code: Optional NUTS regional code - post_code: Optional postal code - post_name: Optional city/locality name - thoroughfare: Optional street address - """ - # Escape all string values for Turtle safety - legal_name_safe = escape_turtle_string(legal_name or "") - country_code_safe = escape_turtle_string(country_code or "") - - # Build address properties dynamically - address_props = [f'epo:hasCountryCode "{country_code_safe}"'] - if nuts_code: - nuts_code_safe = escape_turtle_string(nuts_code) - address_props.append(f'epo:hasNutsCode "{nuts_code_safe}"') - if post_code: - post_code_safe = escape_turtle_string(post_code) - address_props.append(f'locn:postCode "{post_code_safe}"') - if post_name: - post_name_safe = escape_turtle_string(post_name) - address_props.append(f'locn:postName "{post_name_safe}"') - if thoroughfare: - thoroughfare_safe = escape_turtle_string(thoroughfare) - address_props.append(f'locn:thoroughfare "{thoroughfare_safe}"') - - address_content = " ;\n ".join(address_props) - - content = f"""@prefix org: . -@prefix cccev: . -@prefix epo: . -@prefix locn: . -@prefix epd: . - -epd:ent{request_id} a org:Organization ; - epo:hasLegalName "{legal_name_safe}" ; - cccev:registeredAddress [ - {address_content} - ] . 
-""" - - return { - "type": "EntityMentionResolutionRequest", - "entity_mention": { - "identifiedBy": { - "request_id": request_id, - "source_id": source_id, - "entity_type": entity_type, - }, - "content": content.strip(), - "content_type": "text/turtle", - }, - "timestamp": datetime.now(timezone.utc).isoformat(), - "ere_request_id": f"{request_id}:01", - } - - -def parse_response(response_bytes: bytes) -> dict: - """Parse JSON response from Redis.""" - return json.loads(response_bytes.decode("utf-8")) - - -# =============================================================================== -# Demo Data Loading -# =============================================================================== - - -def load_demo_mentions(data_file: str | None = None) -> list[dict]: - """ - Load demo mentions from a JSON file. - - Args: - data_file: Path to JSON file containing mentions. If None, uses default. - - Returns: - List of mention dicts with keys: request_id, source_id, entity_type, - legal_name, country_code, description. - - Raises: - FileNotFoundError: If data file does not exist. - ValueError: If JSON is invalid or missing 'mentions' key. - """ - if data_file is None: - data_file = DEFAULT_DATA_FILE - - data_path = Path(data_file) - if not data_path.exists(): - raise FileNotFoundError(f"Data file not found: {data_path}") - - with open(data_path) as f: - data = json.load(f) - - if "mentions" not in data: - raise ValueError(f"JSON must contain 'mentions' key") - - return data["mentions"] - - -# =============================================================================== -# Main Demo -# =============================================================================== - - -def main(data_file: str | None = None): - """ - Run the Redis-based ERE demo. - - Args: - data_file: Path to JSON file containing demo mentions. - If None, uses default (mentions_mixed_countries.json). 
- """ - logger = setup_logging() - - # Load configuration - logger.info("Loading configuration...") - config = load_env_file() - logger.info( - f"Redis config: host={config['REDIS_HOST']}, " - f"port={config['REDIS_PORT']}, db={config['REDIS_DB']}" - ) - logger.info( - f"Queue names: request={config['REQUEST_QUEUE']}, " - f"response={config['RESPONSE_QUEUE']}" - ) - - # Load demo mentions from JSON - try: - demo_mentions = load_demo_mentions(data_file) - logger.info( - f"Loaded {len(demo_mentions)} mentions from {data_file or DEFAULT_DATA_FILE}" - ) - except (FileNotFoundError, ValueError) as e: - logger.error(f"Failed to load demo mentions: {e}") - return 1 - - # Check Redis connectivity - logger.info("Checking Redis connectivity...") - try: - redis_client = check_redis_connectivity( - host=config["REDIS_HOST"], - port=config["REDIS_PORT"], - db=config["REDIS_DB"], - password=config["REDIS_PASSWORD"], - ) - logger.info("✓ Redis is available") - except RuntimeError as e: - logger.error(f"✗ Redis check failed: {e}") - return 1 - - # Clear queues - logger.info("Clearing request and response queues...") - redis_client.delete(config["REQUEST_QUEUE"], config["RESPONSE_QUEUE"]) - - # ⚠️ Check if DuckDB database is non-empty (stale from prior runs) - # This guards against corrupting demo results by mixing old and new mentions - duckdb_path = Path(os.environ.get("DUCKDB_PATH", "/data/app.duckdb")) - if duckdb_path.exists() and duckdb_path.stat().st_size > 0: - logger.warning( - f"⚠️ WARNING: DuckDB database file exists and is non-empty!\n" - f" This may contain mentions from a prior run.\n" - f" This will CORRUPT demo results by mixing old and new data.\n" - f" \n" - f" To reset the database:\n" - f" 1. docker volume rm ere-local_ere-data\n" - f" 2. 
make infra-rebuild\n" - ) - - # Send demo requests - logger.info(f"Sending {len(demo_mentions)} entity mentions...") - request_ids = [] - - for mention in demo_mentions: - request = create_entity_mention_request( - request_id=mention["request_id"], - source_id=mention["source_id"], - entity_type=mention["entity_type"], - legal_name=mention["legal_name"], - country_code=mention["country_code"], - nuts_code=mention.get("nuts_code"), - post_code=mention.get("post_code"), - post_name=mention.get("post_name"), - thoroughfare=mention.get("thoroughfare"), - ) - - message_json = json.dumps(request) - if logger.isEnabledFor(TRACE): - logger.log(TRACE, f"Full request message:\n{json.dumps(request, indent=2)}") - - message_bytes = message_json.encode("utf-8") - redis_client.rpush(config["REQUEST_QUEUE"], message_bytes) - request_ids.append(mention["request_id"]) - - logger.info( - f" → Sent request {mention['request_id']}: " - f"{mention['legal_name']} ({mention['country_code']}) " - f"[{mention.get('description', '')}]" - ) - - # Wait 1 second between messages to ensure sequential processing - if DELAY_BETWEEN_MESSAGES: - time.sleep(1) - - logger.info("") - logger.info("Listening for responses...") - logger.info("-" * 80) - - # Track mentions for summary: map request_id → (legal_name, cluster_id) - mention_tracking = {} - for mention in demo_mentions: - mention_tracking[mention["request_id"]] = { - "legal_name": mention["legal_name"], - "cluster_id": None, # Will be filled in from response - } - - # Listen for responses - responses_received = {} - start_time = time.time() - - while len(responses_received) < len(request_ids): - elapsed = time.time() - start_time - if GLOBAL_TIMEOUT > 0 and elapsed > GLOBAL_TIMEOUT: - logger.warning( - f"Timeout after {GLOBAL_TIMEOUT}s. Received {len(responses_received)}/{len(request_ids)} responses." 
- ) - break - - # Try to get a response with short timeout - result = redis_client.brpop(config["RESPONSE_QUEUE"], timeout=1) - - if result is not None: - _, response_bytes = result - response = parse_response(response_bytes) - - if logger.isEnabledFor(TRACE): - logger.log( - TRACE, f"Full response message:\n{json.dumps(response, indent=2)}" - ) - - req_id = response["entity_mention_id"]["request_id"] - responses_received[req_id] = response - - logger.info(f"\n✓ Response received for {req_id}:") - logger.info(f" Type: {response['type']}") - logger.info(f" Timestamp: {response['timestamp']}") - - source_id = response["entity_mention_id"]["source_id"] - entity_type = response["entity_mention_id"]["entity_type"] - logger.info(f" Mention: ({source_id}, {req_id}, {entity_type})") - - logger.info(f" Candidates:") - - # Track the top cluster assignment (first candidate is the assignment) - if response.get("candidates"): - top_candidate = response["candidates"][0] - assigned_cluster = top_candidate["cluster_id"] - mention_tracking[req_id]["cluster_id"] = assigned_cluster - logger.info(f" → Assigned to cluster: {assigned_cluster}") - - for i, candidate in enumerate(response.get("candidates", []), 1): - logger.info( - f" {i}. Cluster {candidate['cluster_id']}: " - f"confidence={candidate['confidence_score']:.4f}, " - f"similarity={candidate['similarity_score']:.4f}" - ) - - logger.info("-" * 80) - logger.info( - f"\nDemo complete. Received {len(responses_received)}/{len(request_ids)} responses." 
- ) - - # Build clustering summary as single block - summary_lines = [] - summary_lines.append("=" * 80) - summary_lines.append("CLUSTERING SUMMARY") - summary_lines.append("=" * 80) - - # Group mentions by assigned cluster - clusters = {} - unassigned = [] - - for req_id in request_ids: - tracking = mention_tracking.get(req_id) - if tracking: - cluster_id = tracking["cluster_id"] - legal_name = tracking["legal_name"] - - if cluster_id is None: - unassigned.append((req_id, legal_name)) - else: - if cluster_id not in clusters: - clusters[cluster_id] = [] - clusters[cluster_id].append((req_id, legal_name)) - - # Build cluster output - if clusters: - for cluster_id in sorted(clusters.keys()): - members = clusters[cluster_id] - summary_lines.append("") - summary_lines.append(f"{cluster_id} ({len(members)} members):") - for req_id, legal_name in members: - summary_lines.append(f" {req_id:4s} | {legal_name}") - else: - summary_lines.append("") - summary_lines.append("(No clusters formed)") - - # Add unassigned mentions - if unassigned: - summary_lines.append("") - summary_lines.append(f"Unassigned ({len(unassigned)} mentions):") - for req_id, legal_name in unassigned: - summary_lines.append(f" {req_id:4s} | {legal_name}") - - summary_lines.append("=" * 80) - - # Print entire summary in one log call - summary_block = "\n".join(summary_lines) - logger.info(f"\n{summary_block}") - - # Summary - if len(responses_received) == len(request_ids): - logger.info("✓ All responses received successfully!") - return 0 - else: - logger.warning( - f"✗ Missing {len(request_ids) - len(responses_received)} response(s)." - ) - return 1 - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser( - description="Redis-based ERE demo with parametrized mentions data." 
- ) - parser.add_argument( - "--data", - type=str, - default=None, - help=f"Path to JSON file with demo mentions (default: {DEFAULT_DATA_FILE})", - ) - args = parser.parse_args() - - sys.exit(main(data_file=args.data)) +#!/usr/bin/env python3 +""" +Demo: Indirect Redis client for ERE (Entity Resolution Engine). + +This demo connects to ERE through the Redis queue infrastructure (no direct Python API). +It demonstrates: +1. Checking Redis connectivity +2. Sending EntityMentionResolutionRequest messages to the queue +3. Listening for EntityMentionResolutionResponse messages +4. Logging all interactions + +The example uses 6 synthetic mentions from ALGORITHM.md that cluster into 2 groups: + - Cluster 1: {1, 2, 5} (organizations with high similarity) + - Cluster 2: {3, 4, 6} (different organizations, also highly similar) + +⚠️ IMPORTANT: The ERE resolver persists state in a DuckDB database volume. + Before running a fresh demo with different data, clear the old database: + + docker volume rm ere-local_ere-data + make infra-rebuild + + Failure to do so will mix old mentions with new ones, corrupting demo results. 
+""" + +import json +import logging +import os +import sys +import time +from datetime import datetime, timezone +from pathlib import Path + +import redis + +# Default data file path +DEFAULT_DATA_FILE = Path(__file__).parent / "data" / "org-tiny.json" + +DELAY_BETWEEN_MESSAGES = ( + 0 # seconds to wait between sending messages (set to >0 for sequential processing) +) +GLOBAL_TIMEOUT = 0 # seconds to wait for responses before giving up (0 = no timeout) + + +# =============================================================================== +# Configuration +# =============================================================================== + + +def load_env_file(env_path: str = None) -> dict: + """Load configuration from .env or environment variables.""" + config = {} + + # Try to load from .env if it exists + if env_path is None: + env_path = Path(__file__).parent.parent / "infra" / ".env" + + if Path(env_path).exists(): + with open(env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + if "=" in line: + key, value = line.split("=", 1) + config[key.strip()] = value.strip() + + # Environment variables override .env + config["REDIS_HOST"] = os.environ.get( + "REDIS_HOST", config.get("REDIS_HOST", "localhost") + ) + config["REDIS_PORT"] = int( + os.environ.get("REDIS_PORT", config.get("REDIS_PORT", "6379")) + ) + config["REDIS_DB"] = int(os.environ.get("REDIS_DB", config.get("REDIS_DB", "0"))) + config["REDIS_PASSWORD"] = os.environ.get( + "REDIS_PASSWORD", config.get("REDIS_PASSWORD") + ) + config["REQUEST_QUEUE"] = os.environ.get( + "REQUEST_QUEUE", config.get("REQUEST_QUEUE", "ere_requests") + ) + config["RESPONSE_QUEUE"] = os.environ.get( + "RESPONSE_QUEUE", config.get("RESPONSE_QUEUE", "ere_responses") + ) + + return config + + +# =============================================================================== +# Logging Setup +# =============================================================================== + +TRACE = 5 + + 
+def setup_logging(): + """Configure logging with timestamps.""" + log_level_name = os.environ.get("LOG_LEVEL", "INFO").upper() + + # Handle custom TRACE level + if log_level_name == "TRACE": + log_level = TRACE + logging.addLevelName(TRACE, "TRACE") + else: + log_level = getattr(logging, log_level_name, logging.INFO) + + logging.basicConfig( + level=log_level, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + + logger = logging.getLogger(__name__) + logger.setLevel(log_level) + logger.info(f"Logging configured at level {log_level_name}") + + return logger + + +# =============================================================================== +# Redis Connection +# =============================================================================== + + +def check_redis_connectivity( + host: str, port: int, db: int, password: str +) -> redis.Redis: + """ + Check Redis connectivity and return client. + + Attempts connection to specified host first, then fallback to localhost + if configured host is "redis" (Docker). + + Raises: + RuntimeError: If Redis is not accessible. + """ + hosts_to_try = [host] + + # Fallback: if configured host is "redis" (Docker), also try localhost + if host == "redis": + hosts_to_try.append("localhost") + + last_error = None + for try_host in hosts_to_try: + try: + logging.getLogger(__name__).info( + f"Attempting Redis connection to {try_host}:{port}..." + ) + client = redis.Redis( + host=try_host, + port=port, + db=db, + password=password, + decode_responses=False, + ) + client.ping() + return client + except Exception as e: + last_error = e + continue + + raise RuntimeError( + f"Redis unavailable. 
Tried hosts: {hosts_to_try}, port: {port}, db: {db}" + ) from last_error + + +# =============================================================================== +# Request/Response Handling +# =============================================================================== + + +def escape_turtle_string(value: str) -> str: + """ + Escape a string for safe inclusion in Turtle RDF format. + + Handles special characters: backslash, double quotes, newlines, carriage returns, tabs. + + Args: + value: String to escape + + Returns: + Escaped string safe for use in Turtle string literals + """ + if not value: + return value + + # Escape backslash first (must be done before other escapes) + value = value.replace("\\", "\\\\") + # Escape double quotes + value = value.replace('"', '\\"') + # Escape newlines + value = value.replace("\n", "\\n") + # Escape carriage returns + value = value.replace("\r", "\\r") + # Escape tabs + value = value.replace("\t", "\\t") + + return value + + +def create_entity_mention_request( + request_id: str, + source_id: str, + entity_type: str, + legal_name: str, + country_code: str, + nuts_code: str | None = None, + post_code: str | None = None, + post_name: str | None = None, + thoroughfare: str | None = None, +) -> dict: + """ + Create an EntityMentionResolutionRequest payload. + + Uses RDF/Turtle format with entity metadata including extended address fields. + All string values are properly escaped for Turtle compatibility. 
+ + Args: + request_id: Unique request identifier + source_id: Source system identifier + entity_type: Entity type (e.g., ORGANISATION) + legal_name: Legal name of the entity + country_code: ISO 2-letter country code + nuts_code: Optional NUTS regional code + post_code: Optional postal code + post_name: Optional city/locality name + thoroughfare: Optional street address + """ + # Escape all string values for Turtle safety + legal_name_safe = escape_turtle_string(legal_name or "") + country_code_safe = escape_turtle_string(country_code or "") + + # Build address properties dynamically + address_props = [f'epo:hasCountryCode "{country_code_safe}"'] + if nuts_code: + nuts_code_safe = escape_turtle_string(nuts_code) + address_props.append(f'epo:hasNutsCode "{nuts_code_safe}"') + if post_code: + post_code_safe = escape_turtle_string(post_code) + address_props.append(f'locn:postCode "{post_code_safe}"') + if post_name: + post_name_safe = escape_turtle_string(post_name) + address_props.append(f'locn:postName "{post_name_safe}"') + if thoroughfare: + thoroughfare_safe = escape_turtle_string(thoroughfare) + address_props.append(f'locn:thoroughfare "{thoroughfare_safe}"') + + address_content = " ;\n ".join(address_props) + + content = f"""@prefix org: . +@prefix cccev: . +@prefix epo: . +@prefix locn: . +@prefix epd: . + +epd:ent{request_id} a org:Organization ; + epo:hasLegalName "{legal_name_safe}" ; + cccev:registeredAddress [ + {address_content} + ] . 
+""" + + return { + "type": "EntityMentionResolutionRequest", + "entity_mention": { + "identifiedBy": { + "request_id": request_id, + "source_id": source_id, + "entity_type": entity_type, + }, + "content": content.strip(), + "content_type": "text/turtle", + }, + "timestamp": datetime.now(timezone.utc).isoformat(), + "ere_request_id": f"{request_id}:01", + } + + +def parse_response(response_bytes: bytes) -> dict: + """Parse JSON response from Redis.""" + return json.loads(response_bytes.decode("utf-8")) + + +# =============================================================================== +# Demo Data Loading +# =============================================================================== + + +def load_demo_mentions(data_file: str | None = None) -> list[dict]: + """ + Load demo mentions from a JSON file. + + Args: + data_file: Path to JSON file containing mentions. If None, uses default. + + Returns: + List of mention dicts with keys: request_id, source_id, entity_type, + legal_name, country_code, description. + + Raises: + FileNotFoundError: If data file does not exist. + ValueError: If JSON is invalid or missing 'mentions' key. + """ + if data_file is None: + data_file = DEFAULT_DATA_FILE + + data_path = Path(data_file) + if not data_path.exists(): + raise FileNotFoundError(f"Data file not found: {data_path}") + + with open(data_path) as f: + data = json.load(f) + + if "mentions" not in data: + raise ValueError(f"JSON must contain 'mentions' key") + + return data["mentions"] + + +# =============================================================================== +# Main Demo +# =============================================================================== + + +def main(data_file: str | None = None): + """ + Run the Redis-based ERE demo. + + Args: + data_file: Path to JSON file containing demo mentions. + If None, uses default (mentions_mixed_countries.json). 
+ """ + logger = setup_logging() + + # Load configuration + logger.info("Loading configuration...") + config = load_env_file() + logger.info( + f"Redis config: host={config['REDIS_HOST']}, " + f"port={config['REDIS_PORT']}, db={config['REDIS_DB']}" + ) + logger.info( + f"Queue names: request={config['REQUEST_QUEUE']}, " + f"response={config['RESPONSE_QUEUE']}" + ) + + # Load demo mentions from JSON + try: + demo_mentions = load_demo_mentions(data_file) + logger.info( + f"Loaded {len(demo_mentions)} mentions from {data_file or DEFAULT_DATA_FILE}" + ) + except (FileNotFoundError, ValueError) as e: + logger.error(f"Failed to load demo mentions: {e}") + return 1 + + # Check Redis connectivity + logger.info("Checking Redis connectivity...") + try: + redis_client = check_redis_connectivity( + host=config["REDIS_HOST"], + port=config["REDIS_PORT"], + db=config["REDIS_DB"], + password=config["REDIS_PASSWORD"], + ) + logger.info("✓ Redis is available") + except RuntimeError as e: + logger.error(f"✗ Redis check failed: {e}") + return 1 + + # Clear queues + logger.info("Clearing request and response queues...") + redis_client.delete(config["REQUEST_QUEUE"], config["RESPONSE_QUEUE"]) + + # ⚠️ Check if DuckDB database is non-empty (stale from prior runs) + # This guards against corrupting demo results by mixing old and new mentions + duckdb_path = Path(os.environ.get("DUCKDB_PATH", "/data/app.duckdb")) + if duckdb_path.exists() and duckdb_path.stat().st_size > 0: + logger.warning( + f"⚠️ WARNING: DuckDB database file exists and is non-empty!\n" + f" This may contain mentions from a prior run.\n" + f" This will CORRUPT demo results by mixing old and new data.\n" + f" \n" + f" To reset the database:\n" + f" 1. docker volume rm ere-local_ere-data\n" + f" 2. 
make infra-rebuild\n" + ) + + # Send demo requests + logger.info(f"Sending {len(demo_mentions)} entity mentions...") + request_ids = [] + + for mention in demo_mentions: + request = create_entity_mention_request( + request_id=mention["request_id"], + source_id=mention["source_id"], + entity_type=mention["entity_type"], + legal_name=mention["legal_name"], + country_code=mention["country_code"], + nuts_code=mention.get("nuts_code"), + post_code=mention.get("post_code"), + post_name=mention.get("post_name"), + thoroughfare=mention.get("thoroughfare"), + ) + + message_json = json.dumps(request) + if logger.isEnabledFor(TRACE): + logger.log(TRACE, f"Full request message:\n{json.dumps(request, indent=2)}") + + message_bytes = message_json.encode("utf-8") + redis_client.rpush(config["REQUEST_QUEUE"], message_bytes) + request_ids.append(mention["request_id"]) + + logger.info( + f" → Sent request {mention['request_id']}: " + f"{mention['legal_name']} ({mention['country_code']}) " + f"[{mention.get('description', '')}]" + ) + + # Wait 1 second between messages to ensure sequential processing + if DELAY_BETWEEN_MESSAGES: + time.sleep(1) + + logger.info("") + logger.info("Listening for responses...") + logger.info("-" * 80) + + # Track mentions for summary: map request_id → (legal_name, cluster_id) + mention_tracking = {} + for mention in demo_mentions: + mention_tracking[mention["request_id"]] = { + "legal_name": mention["legal_name"], + "cluster_id": None, # Will be filled in from response + } + + # Listen for responses + responses_received = {} + start_time = time.time() + + while len(responses_received) < len(request_ids): + elapsed = time.time() - start_time + if GLOBAL_TIMEOUT > 0 and elapsed > GLOBAL_TIMEOUT: + logger.warning( + f"Timeout after {GLOBAL_TIMEOUT}s. Received {len(responses_received)}/{len(request_ids)} responses." 
+ ) + break + + # Try to get a response with short timeout + result = redis_client.brpop(config["RESPONSE_QUEUE"], timeout=1) + + if result is not None: + _, response_bytes = result + response = parse_response(response_bytes) + + if logger.isEnabledFor(TRACE): + logger.log( + TRACE, f"Full response message:\n{json.dumps(response, indent=2)}" + ) + + req_id = response["entity_mention_id"]["request_id"] + responses_received[req_id] = response + + logger.info(f"\n✓ Response received for {req_id}:") + logger.info(f" Type: {response['type']}") + logger.info(f" Timestamp: {response['timestamp']}") + + source_id = response["entity_mention_id"]["source_id"] + entity_type = response["entity_mention_id"]["entity_type"] + logger.info(f" Mention: ({source_id}, {req_id}, {entity_type})") + + logger.info(f" Candidates:") + + # Track the top cluster assignment (first candidate is the assignment) + if response.get("candidates"): + top_candidate = response["candidates"][0] + assigned_cluster = top_candidate["cluster_id"] + mention_tracking[req_id]["cluster_id"] = assigned_cluster + logger.info(f" → Assigned to cluster: {assigned_cluster}") + + for i, candidate in enumerate(response.get("candidates", []), 1): + logger.info( + f" {i}. Cluster {candidate['cluster_id']}: " + f"confidence={candidate['confidence_score']:.4f}, " + f"similarity={candidate['similarity_score']:.4f}" + ) + + logger.info("-" * 80) + logger.info( + f"\nDemo complete. Received {len(responses_received)}/{len(request_ids)} responses." 
+ ) + + # Build clustering summary as single block + summary_lines = [] + summary_lines.append("=" * 80) + summary_lines.append("CLUSTERING SUMMARY") + summary_lines.append("=" * 80) + + # Group mentions by assigned cluster + clusters = {} + unassigned = [] + + for req_id in request_ids: + tracking = mention_tracking.get(req_id) + if tracking: + cluster_id = tracking["cluster_id"] + legal_name = tracking["legal_name"] + + if cluster_id is None: + unassigned.append((req_id, legal_name)) + else: + if cluster_id not in clusters: + clusters[cluster_id] = [] + clusters[cluster_id].append((req_id, legal_name)) + + # Build cluster output + if clusters: + for cluster_id in sorted(clusters.keys()): + members = clusters[cluster_id] + summary_lines.append("") + summary_lines.append(f"{cluster_id} ({len(members)} members):") + for req_id, legal_name in members: + summary_lines.append(f" {req_id:4s} | {legal_name}") + else: + summary_lines.append("") + summary_lines.append("(No clusters formed)") + + # Add unassigned mentions + if unassigned: + summary_lines.append("") + summary_lines.append(f"Unassigned ({len(unassigned)} mentions):") + for req_id, legal_name in unassigned: + summary_lines.append(f" {req_id:4s} | {legal_name}") + + summary_lines.append("=" * 80) + + # Print entire summary in one log call + summary_block = "\n".join(summary_lines) + logger.info(f"\n{summary_block}") + + # Summary + if len(responses_received) == len(request_ids): + logger.info("✓ All responses received successfully!") + return 0 + else: + logger.warning( + f"✗ Missing {len(request_ids) - len(responses_received)} response(s)." + ) + return 1 + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Redis-based ERE demo with parametrized mentions data." 
+ ) + parser.add_argument( + "--data", + type=str, + default=None, + help=f"Path to JSON file with demo mentions (default: {DEFAULT_DATA_FILE})", + ) + args = parser.parse_args() + + sys.exit(main(data_file=args.data)) From def5ab0253afe5553a11bbbe5c4af9aef6988a65 Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 16:42:10 +0200 Subject: [PATCH 08/14] using develop branch of ers-spec for now --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7520dbf..e4e5579 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ pandas = ">=2.0,<3.0" splink = ">=4.0,<5.0" # TODO: should we have a registry? -ers-spec = { git = "https://github.com/OP-TED/entity-resolution-spec.git", branch = "0.3.0-rc.1" } +ers-spec = { git = "https://github.com/OP-TED/entity-resolution-spec.git", branch = "develop" } [tool.pytest.ini_options] From c0607a699741b32149a4de5f6234874f70fc2cc7 Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 17:59:25 +0200 Subject: [PATCH 09/14] test: add unit tests to reach 85% coverage threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the SonarQube quality gate failure from PR#21 (69% coverage on new code). 
New test modules: - test/unit/adapters/test_utils.py: message parsing (get_request/response_from_message) - test/unit/adapters/test_adapter_factories.py: build_rdf_mapper factory - test/unit/entrypoints/test_queue_worker.py: RedisQueueWorker with mocked Redis - test/unit/utils/test_logging.py: configure_logging and TRACE level - test/unit/services/test_services_factories.py: build_entity_resolver (in-memory + persistent DuckDB) - test/unit/test_models.py: MentionLink/ResolutionResult edge cases + app.main() failure paths Extended test files: - stubs.py: add StubRDFMapper and find_by_id to InMemoryMentionRepository - test_entity_resolution_service.py: EntityResolutionService process_request paths - test_duckdb_adapters.py: load_all and save_all([]) coverage Coverage: 61% → 85% (unit + BDD combined) --- test/unit/adapters/stubs.py | 31 +++++ test/unit/adapters/test_adapter_factories.py | 18 +++ test/unit/adapters/test_duckdb_adapters.py | 26 ++++ test/unit/adapters/test_utils.py | 74 +++++++++++ test/unit/entrypoints/__init__.py | 0 test/unit/entrypoints/test_queue_worker.py | 124 ++++++++++++++++++ .../test_entity_resolution_service.py | 119 ++++++++++++++++- test/unit/services/test_services_factories.py | 67 ++++++++++ test/unit/test_models.py | 111 ++++++++++++++++ test/unit/utils/__init__.py | 0 test/unit/utils/test_logging.py | 54 ++++++++ 11 files changed, 622 insertions(+), 2 deletions(-) create mode 100644 test/unit/adapters/test_adapter_factories.py create mode 100644 test/unit/adapters/test_utils.py create mode 100644 test/unit/entrypoints/__init__.py create mode 100644 test/unit/entrypoints/test_queue_worker.py create mode 100644 test/unit/services/test_services_factories.py create mode 100644 test/unit/test_models.py create mode 100644 test/unit/utils/__init__.py create mode 100644 test/unit/utils/test_logging.py diff --git a/test/unit/adapters/stubs.py b/test/unit/adapters/stubs.py index 2b7741b..d2054bc 100644 --- a/test/unit/adapters/stubs.py +++ 
b/test/unit/adapters/stubs.py @@ -2,6 +2,9 @@ from typing import Protocol, runtime_checkable +from erspec.models.core import EntityMention + +from ere.adapters.rdf_mapper_port import RDFMapper from ere.models.resolver import ( ClusterId, ClusterMembership, @@ -78,6 +81,9 @@ def save(self, mention: Mention) -> None: def load_all(self) -> list[Mention]: return list(self._mentions.values()) + def find_by_id(self, mention_id: MentionId) -> Mention | None: + return self._mentions.get(mention_id) + def count(self) -> int: return len(self._mentions) @@ -193,3 +199,28 @@ def register_mention(self, mention: Mention) -> None: def train(self) -> None: """No-op for fixed linker (scores are pre-configured).""" pass + + +class StubRDFMapper(RDFMapper): + """ + RDFMapper stub for unit testing. + + Returns a pre-configured Mention without performing any RDF parsing. + Optionally raises a configured exception to test error paths. + """ + + def __init__( + self, + mention_to_return: Mention = None, + error: Exception = None, + ): + self._mention = mention_to_return or Mention( + id=MentionId(value="stub-mention-id"), + attributes={"legal_name": "Stub Corp", "country_code": "US"}, + ) + self._error = error + + def map_entity_mention_to_domain(self, entity_mention: EntityMention) -> Mention: + if self._error is not None: + raise self._error + return self._mention diff --git a/test/unit/adapters/test_adapter_factories.py b/test/unit/adapters/test_adapter_factories.py new file mode 100644 index 0000000..e339df4 --- /dev/null +++ b/test/unit/adapters/test_adapter_factories.py @@ -0,0 +1,18 @@ +"""Unit tests for adapters.factories: RDFMapper construction.""" + +from pathlib import Path + +from ere.adapters.factories import build_rdf_mapper +from ere.adapters.rdf_mapper_port import RDFMapper + +TEST_RDF_MAPPING = Path(__file__).parent.parent.parent / "resources" / "rdf_mapping.yaml" + + +def test_build_rdf_mapper_with_explicit_path_returns_mapper(): + mapper = 
build_rdf_mapper(rdf_mapping_path=TEST_RDF_MAPPING) + assert isinstance(mapper, RDFMapper) + + +def test_build_rdf_mapper_without_path_uses_default(): + mapper = build_rdf_mapper() + assert isinstance(mapper, RDFMapper) diff --git a/test/unit/adapters/test_duckdb_adapters.py b/test/unit/adapters/test_duckdb_adapters.py index 03f5b79..fa6cb4b 100644 --- a/test/unit/adapters/test_duckdb_adapters.py +++ b/test/unit/adapters/test_duckdb_adapters.py @@ -244,3 +244,29 @@ def test_cluster_membership_mapping(service, con): assert len(memberships[cluster_id]) == 2 assert MentionId(value="m1") in memberships[cluster_id] assert MentionId(value="m2") in memberships[cluster_id] + + +def test_mention_repository_load_all_returns_persisted_mentions(con, entity_fields): + """load_all should return all mentions previously saved.""" + repo = DuckDBMentionRepository(con, entity_fields) + m1 = Mention(id=MentionId(value="la1"), attributes={"legal_name": "Alpha", "country_code": "DE"}) + m2 = Mention(id=MentionId(value="la2"), attributes={"legal_name": "Beta", "country_code": "FR"}) + + repo.save(m1) + repo.save(m2) + + loaded = repo.load_all() + + assert len(loaded) == 2 + ids = {m.id.value for m in loaded} + assert ids == {"la1", "la2"} + + +def test_similarity_repository_save_all_empty_is_noop(con): + """save_all with an empty list should not raise and not write any rows.""" + repo = DuckDBSimilarityRepository(con) + + repo.save_all([]) # must not raise + + count = con.execute("SELECT COUNT(*) FROM similarities").fetchone()[0] + assert count == 0 diff --git a/test/unit/adapters/test_utils.py b/test/unit/adapters/test_utils.py new file mode 100644 index 0000000..80fb431 --- /dev/null +++ b/test/unit/adapters/test_utils.py @@ -0,0 +1,74 @@ +"""Unit tests for adapters.utils: message parsing utilities.""" + +import json +from datetime import datetime, timezone + +import pytest +from erspec.models.core import EntityMention, EntityMentionIdentifier +from erspec.models.ere import ( + 
EREErrorResponse, + EntityMentionResolutionRequest, + EntityMentionResolutionResponse, +) +from linkml_runtime.dumpers import JSONDumper + +from ere.adapters.utils import ( + get_message_object, + get_request_from_message, + get_response_from_message, +) + +_dumper = JSONDumper() + + +def _make_request(request_id: str = "utils-test-001") -> EntityMentionResolutionRequest: + return EntityMentionResolutionRequest( + entity_mention=EntityMention( + identifiedBy=EntityMentionIdentifier( + request_id=request_id, + source_id="utils-test-src", + entity_type="http://test.org/Org", + ), + content_type="text/turtle", + content="<>", + ), + ere_request_id=request_id, + timestamp=datetime.now(timezone.utc).isoformat(), + ) + + +def _serialise(obj) -> bytes: + return _dumper.dumps(obj).encode("utf-8") + + +def test_get_request_from_message_returns_request(): + raw = _serialise(_make_request("req-parse-01")) + result = get_request_from_message(raw) + assert isinstance(result, EntityMentionResolutionRequest) + assert result.ere_request_id == "req-parse-01" + + +def test_get_response_from_message_returns_error_response(): + response = EREErrorResponse( + ere_request_id="resp-parse-01", + error_type="TestError", + error_title="Test", + error_detail="detail", + timestamp=datetime.now(timezone.utc).isoformat(), + ) + raw = _serialise(response) + result = get_response_from_message(raw) + assert isinstance(result, EREErrorResponse) + assert result.ere_request_id == "resp-parse-01" + + +def test_get_message_object_raises_on_missing_type(): + raw = json.dumps({"ere_request_id": "no-type"}).encode("utf-8") + with pytest.raises(ValueError, match="message without 'type' field"): + get_message_object(raw, {}) + + +def test_get_message_object_raises_on_unsupported_type(): + raw = json.dumps({"type": "UnknownClass", "ere_request_id": "x"}).encode("utf-8") + with pytest.raises(ValueError, match='unsupported message class: "UnknownClass"'): + get_message_object(raw, {}) diff --git 
a/test/unit/entrypoints/__init__.py b/test/unit/entrypoints/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unit/entrypoints/test_queue_worker.py b/test/unit/entrypoints/test_queue_worker.py new file mode 100644 index 0000000..b55f143 --- /dev/null +++ b/test/unit/entrypoints/test_queue_worker.py @@ -0,0 +1,124 @@ +"""Unit tests for RedisQueueWorker entrypoint (mocked Redis and service).""" + +import json +from datetime import datetime, timezone +from unittest.mock import MagicMock + +import pytest +from erspec.models.core import EntityMention, EntityMentionIdentifier +from erspec.models.ere import ( + EREErrorResponse, + EntityMentionResolutionRequest, + EntityMentionResolutionResponse, +) +from linkml_runtime.dumpers import JSONDumper + +from ere.entrypoints.queue_worker import RedisQueueWorker + +_dumper = JSONDumper() + + +def _make_request(request_id: str = "qw-test-001") -> EntityMentionResolutionRequest: + return EntityMentionResolutionRequest( + entity_mention=EntityMention( + identifiedBy=EntityMentionIdentifier( + request_id=request_id, + source_id="qw-src", + entity_type="http://test.org/Org", + ), + content_type="text/turtle", + content="<>", + ), + ere_request_id=request_id, + timestamp=datetime.now(timezone.utc).isoformat(), + ) + + +def _make_response(request_id: str = "qw-test-001") -> EntityMentionResolutionResponse: + return EntityMentionResolutionResponse( + entity_mention_id=EntityMentionIdentifier( + request_id=request_id, + source_id="qw-src", + entity_type="http://test.org/Org", + ), + candidates=[], + ere_request_id=request_id, + timestamp=datetime.now(timezone.utc).isoformat(), + ) + + +@pytest.fixture +def mock_redis(): + return MagicMock() + + +@pytest.fixture +def mock_service(): + return MagicMock() + + +@pytest.fixture +def worker(mock_redis, mock_service) -> RedisQueueWorker: + return RedisQueueWorker( + redis_client=mock_redis, + entity_resolution_service=mock_service, + request_queue="ere_requests", + 
response_queue="ere_responses", + queue_timeout=1, + ) + + +def test_process_single_message_returns_false_on_timeout(worker, mock_redis): + mock_redis.brpop.return_value = None + + result = worker.process_single_message() + + assert result is False + + +def test_process_single_message_returns_true_on_success(worker, mock_redis, mock_service): + request = _make_request("qw-happy") + raw_msg = _dumper.dumps(request).encode("utf-8") + mock_redis.brpop.return_value = ("ere_requests", raw_msg) + mock_service.process_request.return_value = _make_response("qw-happy") + + result = worker.process_single_message() + + assert result is True + mock_service.process_request.assert_called_once() + mock_redis.lpush.assert_called_once() + + +def test_process_single_message_sends_error_response_on_parse_failure( + worker, mock_redis, mock_service +): + mock_redis.brpop.return_value = ("ere_requests", b"not valid json at all") + + result = worker.process_single_message() + + assert result is True + mock_redis.lpush.assert_called_once() + pushed_payload = mock_redis.lpush.call_args[0][1] + pushed_json = json.loads(pushed_payload) + assert pushed_json.get("error_type") == "ProcessingError" + + +def test_send_response_logs_error_on_redis_failure(worker, mock_redis): + mock_redis.lpush.side_effect = ConnectionError("redis down") + response = EREErrorResponse( + ere_request_id="err-resp", + error_type="TestError", + error_title="Test", + error_detail="detail", + timestamp=datetime.now(timezone.utc).isoformat(), + ) + worker._send_response(response) # must not raise + + +def test_build_error_response_returns_ere_error_response(): + response = RedisQueueWorker._build_error_response("something broke", "req-err") + + assert isinstance(response, EREErrorResponse) + assert response.ere_request_id == "req-err" + assert response.error_type == "ProcessingError" + assert "something broke" in response.error_detail diff --git a/test/unit/services/test_entity_resolution_service.py 
b/test/unit/services/test_entity_resolution_service.py index 0948f06..617cd42 100644 --- a/test/unit/services/test_entity_resolution_service.py +++ b/test/unit/services/test_entity_resolution_service.py @@ -1,6 +1,14 @@ -"""Unit tests for EntityResolver (no DuckDB, no Splink).""" +"""Unit tests for EntityResolver and EntityResolutionService (no DuckDB, no Splink).""" import pytest +from datetime import datetime, timezone + +from erspec.models.core import EntityMention, EntityMentionIdentifier +from erspec.models.ere import ( + EREErrorResponse, + EntityMentionResolutionRequest, + EntityMentionResolutionResponse, +) from ere.models.resolver import ( ClusterId, @@ -8,13 +16,18 @@ MentionId, MentionLink, ) -from ere.services.entity_resolution_service import EntityResolver +from ere.services.entity_resolution_service import ( + EntityResolutionService, + EntityResolver, + resolve_entity_mention, +) from ere.services.resolver_config import DuckDBConfig, ResolverConfig from test.unit.adapters.stubs import ( FixedSimilarityLinker, InMemoryClusterRepository, InMemoryMentionRepository, InMemorySimilarityRepository, + StubRDFMapper, ) @@ -484,3 +497,105 @@ def test_multiple_independent_clusters(service): state = service.state() assert state.cluster_count == 3 assert state.mention_count == 3 + + +# =============================================================================== +# resolve_entity_mention guard tests +# =============================================================================== + + +def test_resolve_entity_mention_raises_when_resolver_is_none(): + mention = EntityMention( + identifiedBy=EntityMentionIdentifier( + request_id="m1", + source_id="src", + entity_type="http://test.org/Org", + ), + content_type="text/turtle", + content="<>", + ) + with pytest.raises(ValueError, match="resolver must be provided"): + resolve_entity_mention(mention, resolver=None, mapper=StubRDFMapper()) + + +def test_resolve_entity_mention_raises_when_mapper_is_none(service): + 
mention = EntityMention( + identifiedBy=EntityMentionIdentifier( + request_id="m1", + source_id="src", + entity_type="http://test.org/Org", + ), + content_type="text/turtle", + content="<>", + ) + with pytest.raises(ValueError, match="mapper must be provided"): + resolve_entity_mention(mention, resolver=service, mapper=None) + + +# =============================================================================== +# EntityResolutionService tests +# =============================================================================== + + +@pytest.fixture +def stub_mapper() -> StubRDFMapper: + return StubRDFMapper() + + +@pytest.fixture +def resolution_service(service: EntityResolver, stub_mapper: StubRDFMapper) -> EntityResolutionService: + return EntityResolutionService(resolver=service, mapper=stub_mapper) + + +def _make_request(request_id: str = "req-001") -> EntityMentionResolutionRequest: + return EntityMentionResolutionRequest( + entity_mention=EntityMention( + identifiedBy=EntityMentionIdentifier( + request_id=request_id, + source_id="test-src", + entity_type="http://test.org/Org", + ), + content_type="text/turtle", + content="<>", + ), + ere_request_id=request_id, + timestamp=datetime.now(timezone.utc).isoformat(), + ) + + +def test_process_request_unsupported_type_returns_error_response(resolution_service): + class UnknownRequest: + ere_request_id = "unknown-001" + + response = resolution_service.process_request(UnknownRequest()) + + assert isinstance(response, EREErrorResponse) + assert response.error_type == "UnsupportedRequestType" + + +def test_process_request_happy_path_returns_resolution_response(resolution_service): + request = _make_request("req-happy") + + response = resolution_service.process_request(request) + + assert isinstance(response, EntityMentionResolutionResponse) + assert response.ere_request_id == "req-happy" + assert len(response.candidates) >= 1 + + +def test_process_request_mapper_error_returns_error_response(service: EntityResolver): + 
failing_mapper = StubRDFMapper(error=ValueError("RDF parse failure")) + svc = EntityResolutionService(resolver=service, mapper=failing_mapper) + + response = svc.process_request(_make_request("req-fail")) + + assert isinstance(response, EREErrorResponse) + assert response.error_type == "ValueError" + assert "RDF parse failure" in response.error_detail + + +def test_call_delegates_to_process_request(resolution_service): + request = _make_request("req-call") + response = resolution_service(request) + assert isinstance(response, EntityMentionResolutionResponse) + assert response.ere_request_id == "req-call" diff --git a/test/unit/services/test_services_factories.py b/test/unit/services/test_services_factories.py new file mode 100644 index 0000000..46398d8 --- /dev/null +++ b/test/unit/services/test_services_factories.py @@ -0,0 +1,67 @@ +"""Unit tests for services.factories: construction of resolver and service.""" + +from pathlib import Path + +import pytest +import yaml + +from ere.services.entity_resolution_service import EntityResolutionService, EntityResolver +from ere.services.factories import build_entity_resolution_service, build_entity_resolver +from test.unit.adapters.stubs import StubRDFMapper + +TEST_RESOLVER_CONFIG = Path(__file__).parent.parent.parent / "resources" / "resolver.yaml" + + +def test_build_entity_resolver_returns_entity_resolver(): + resolver = build_entity_resolver(resolver_config_path=TEST_RESOLVER_CONFIG) + assert isinstance(resolver, EntityResolver) + + +def test_build_entity_resolver_uses_default_config_when_no_path_given(): + resolver = build_entity_resolver() + assert isinstance(resolver, EntityResolver) + + +def test_build_entity_resolver_with_explicit_entity_fields(): + resolver = build_entity_resolver( + entity_fields=["legal_name"], + resolver_config_path=TEST_RESOLVER_CONFIG, + ) + assert isinstance(resolver, EntityResolver) + + +def test_build_entity_resolver_with_persistent_duckdb(tmp_path): + db_file = str(tmp_path / 
"test.duckdb") + with open(TEST_RESOLVER_CONFIG, encoding="utf-8") as f: + raw = yaml.safe_load(f) + raw["duckdb"] = {"type": "persistent", "path": db_file} + config = tmp_path / "persistent.yaml" + config.write_text(yaml.dump(raw), encoding="utf-8") + + resolver = build_entity_resolver(resolver_config_path=config, duckdb_path=db_file) + assert isinstance(resolver, EntityResolver) + + +def test_build_entity_resolver_raises_on_invalid_duckdb_type(tmp_path): + bad_config = tmp_path / "bad.yaml" + bad_config.write_text( + "threshold: 0.8\n" + "match_weight_threshold: -10\n" + "top_n: 10\n" + "entity_fields: [legal_name]\n" + "duckdb:\n" + " type: invalid_type\n" + " path: ':memory:'\n", + encoding="utf-8", + ) + with pytest.raises(ValueError, match="Invalid duckdb type"): + build_entity_resolver(resolver_config_path=bad_config) + + +def test_build_entity_resolution_service_returns_service(): + resolver = build_entity_resolver(resolver_config_path=TEST_RESOLVER_CONFIG) + mapper = StubRDFMapper() + + service = build_entity_resolution_service(resolver, mapper) + + assert isinstance(service, EntityResolutionService) diff --git a/test/unit/test_models.py b/test/unit/test_models.py new file mode 100644 index 0000000..d984596 --- /dev/null +++ b/test/unit/test_models.py @@ -0,0 +1,111 @@ +"""Unit tests for domain model edge cases (error paths and utility methods).""" + +import pytest +from unittest.mock import MagicMock, patch + +from ere.models.resolver import ClusterId, MentionId +from ere.models.resolver.cluster import CandidateCluster, ResolutionResult +from ere.models.resolver.similarity import MentionLink + + +# ============================================================================ +# MentionLink +# ============================================================================ + + +def test_mention_link_rejects_same_left_and_right_id(): + m = MentionId(value="x") + with pytest.raises(ValueError, match="left_id and right_id must differ"): + MentionLink(left_id=m, 
right_id=m, score=0.9) + + +def test_mention_link_other_returns_right_when_from_is_left(): + left = MentionId(value="a") + right = MentionId(value="b") + link = MentionLink(left_id=left, right_id=right, score=0.5) + assert link.other(left) == right + + +def test_mention_link_other_returns_left_when_from_is_right(): + left = MentionId(value="a") + right = MentionId(value="b") + link = MentionLink(left_id=left, right_id=right, score=0.5) + assert link.other(right) == left + + +def test_mention_link_other_raises_when_id_not_in_link(): + left = MentionId(value="a") + right = MentionId(value="b") + unknown = MentionId(value="z") + link = MentionLink(left_id=left, right_id=right, score=0.5) + with pytest.raises(ValueError): + link.other(unknown) + + +# ============================================================================ +# ResolutionResult / CandidateCluster +# ============================================================================ + + +def test_resolution_result_rejects_empty_candidates(): + with pytest.raises(ValueError, match="must be non-empty"): + ResolutionResult(candidates=()) + + +def test_candidate_cluster_as_tuple_returns_id_and_score(): + c = CandidateCluster(cluster_id=ClusterId(value="c1"), score=0.75) + assert c.as_tuple() == ("c1", 0.75) + + +def test_resolution_result_as_tuples_returns_list(): + candidates = ( + CandidateCluster(cluster_id=ClusterId(value="c1"), score=0.9), + CandidateCluster(cluster_id=ClusterId(value="c2"), score=0.6), + ) + result = ResolutionResult(candidates=candidates) + assert result.as_tuples() == [("c1", 0.9), ("c2", 0.6)] + + +# ============================================================================ +# app.main() failure paths +# ============================================================================ + + +def test_main_exits_when_redis_connection_fails(monkeypatch): + monkeypatch.setattr("sys.argv", ["ere"]) + with patch("redis.Redis") as mock_redis_cls, \ + patch("ere.entrypoints.app.configure_logging"): 
+ mock_redis_cls.return_value.ping.side_effect = ConnectionError("no redis") + with pytest.raises(SystemExit) as exc: + from ere.entrypoints.app import main + main() + assert exc.value.code == 1 + + +def test_main_exits_when_service_build_fails(monkeypatch): + monkeypatch.setattr("sys.argv", ["ere"]) + with patch("redis.Redis") as mock_redis_cls, \ + patch("ere.entrypoints.app.configure_logging"), \ + patch("ere.entrypoints.app.build_entity_resolver", side_effect=RuntimeError("build fail")): + mock_redis_cls.return_value.ping.return_value = True + with pytest.raises(SystemExit) as exc: + from ere.entrypoints.app import main + main() + assert exc.value.code == 1 + + +def test_main_runs_loop_until_keyboard_interrupt(monkeypatch): + monkeypatch.setattr("sys.argv", ["ere"]) + mock_resolver = MagicMock() + mock_resolver._mention_repo._con = MagicMock() + + with patch("redis.Redis") as mock_redis_cls, \ + patch("ere.entrypoints.app.configure_logging"), \ + patch("ere.entrypoints.app.build_entity_resolver", return_value=mock_resolver), \ + patch("ere.entrypoints.app.build_rdf_mapper", return_value=MagicMock()), \ + patch("ere.entrypoints.app.build_entity_resolution_service", return_value=MagicMock()), \ + patch("ere.entrypoints.app.RedisQueueWorker") as mock_worker_cls: + mock_redis_cls.return_value.ping.return_value = True + mock_worker_cls.return_value.process_single_message.side_effect = KeyboardInterrupt() + from ere.entrypoints.app import main + main() # must return cleanly (KeyboardInterrupt caught internally) diff --git a/test/unit/utils/__init__.py b/test/unit/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/unit/utils/test_logging.py b/test/unit/utils/test_logging.py new file mode 100644 index 0000000..455d1bd --- /dev/null +++ b/test/unit/utils/test_logging.py @@ -0,0 +1,54 @@ +"""Unit tests for utils.logging: log-level setup and TRACE level.""" + +import logging +from unittest.mock import call, patch + +import pytest + +from 
ere.utils.logging import TRACE_LEVEL_NUM, configure_logging + + +def test_configure_logging_passes_warning_level_to_basicconfig(): + with patch("logging.basicConfig") as mock_bc: + configure_logging("WARNING") + mock_bc.assert_called_once() + assert mock_bc.call_args[1]["level"] == logging.WARNING + + +def test_configure_logging_passes_trace_level_to_basicconfig(): + with patch("logging.basicConfig") as mock_bc: + configure_logging("TRACE") + mock_bc.assert_called_once() + assert mock_bc.call_args[1]["level"] == TRACE_LEVEL_NUM + + +def test_configure_logging_reads_env_var(monkeypatch): + monkeypatch.setenv("LOG_LEVEL", "ERROR") + with patch("logging.basicConfig") as mock_bc: + configure_logging() + assert mock_bc.call_args[1]["level"] == logging.ERROR + + +def test_configure_logging_defaults_to_info(monkeypatch): + monkeypatch.delenv("LOG_LEVEL", raising=False) + with patch("logging.basicConfig") as mock_bc: + configure_logging() + assert mock_bc.call_args[1]["level"] == logging.INFO + + +def test_trace_method_exists_on_logger(): + log = logging.getLogger("test.trace") + assert callable(getattr(log, "trace", None)) + + +def test_trace_method_logs_when_enabled(caplog): + log = logging.getLogger("test.trace.enabled") + with caplog.at_level(TRACE_LEVEL_NUM, logger="test.trace.enabled"): + log.trace("trace message sent") + assert "trace message sent" in caplog.text + + +def test_trace_method_does_not_log_when_disabled(): + log = logging.getLogger("test.trace.silent") + log.setLevel(logging.INFO) + log.trace("this should not explode") From c63a43134ee372e68a7aa895066fa52955802213 Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 18:02:06 +0200 Subject: [PATCH 10/14] updated project setup --- .claude/skills/gitnexus/gitnexus-cli/SKILL.md | 82 ++++++++++++ .../gitnexus/gitnexus-debugging/SKILL.md | 89 +++++++++++++ .../gitnexus/gitnexus-exploring/SKILL.md | 78 +++++++++++ .../skills/gitnexus/gitnexus-guide/SKILL.md | 64 +++++++++ 
.../gitnexus-impact-analysis/SKILL.md | 97 ++++++++++++++ .../gitnexus/gitnexus-refactoring/SKILL.md | 121 ++++++++++++++++++ .gitignore | 2 +- 7 files changed, 532 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/gitnexus/gitnexus-cli/SKILL.md create mode 100644 .claude/skills/gitnexus/gitnexus-debugging/SKILL.md create mode 100644 .claude/skills/gitnexus/gitnexus-exploring/SKILL.md create mode 100644 .claude/skills/gitnexus/gitnexus-guide/SKILL.md create mode 100644 .claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md create mode 100644 .claude/skills/gitnexus/gitnexus-refactoring/SKILL.md diff --git a/.claude/skills/gitnexus/gitnexus-cli/SKILL.md b/.claude/skills/gitnexus/gitnexus-cli/SKILL.md new file mode 100644 index 0000000..c9e0af3 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-cli/SKILL.md @@ -0,0 +1,82 @@ +--- +name: gitnexus-cli +description: "Use when the user needs to run GitNexus CLI commands like analyze/index a repo, check status, clean the index, generate a wiki, or list indexed repos. Examples: \"Index this repo\", \"Reanalyze the codebase\", \"Generate a wiki\"" +--- + +# GitNexus CLI Commands + +All commands work via `npx` — no global install required. + +## Commands + +### analyze — Build or refresh the index + +```bash +npx gitnexus analyze +``` + +Run from the project root. This parses all source files, builds the knowledge graph, writes it to `.gitnexus/`, and generates CLAUDE.md / AGENTS.md context files. + +| Flag | Effect | +| -------------- | ---------------------------------------------------------------- | +| `--force` | Force full re-index even if up to date | +| `--embeddings` | Enable embedding generation for semantic search (off by default) | + +**When to run:** First time in a project, after major code changes, or when `gitnexus://repo/{name}/context` reports the index is stale. 
In Claude Code, a PostToolUse hook runs `analyze` automatically after `git commit` and `git merge`, preserving embeddings if previously generated. + +### status — Check index freshness + +```bash +npx gitnexus status +``` + +Shows whether the current repo has a GitNexus index, when it was last updated, and symbol/relationship counts. Use this to check if re-indexing is needed. + +### clean — Delete the index + +```bash +npx gitnexus clean +``` + +Deletes the `.gitnexus/` directory and unregisters the repo from the global registry. Use before re-indexing if the index is corrupt or after removing GitNexus from a project. + +| Flag | Effect | +| --------- | ------------------------------------------------- | +| `--force` | Skip confirmation prompt | +| `--all` | Clean all indexed repos, not just the current one | + +### wiki — Generate documentation from the graph + +```bash +npx gitnexus wiki +``` + +Generates repository documentation from the knowledge graph using an LLM. Requires an API key (saved to `~/.gitnexus/config.json` on first use). + +| Flag | Effect | +| ------------------- | ----------------------------------------- | +| `--force` | Force full regeneration | +| `--model ` | LLM model (default: minimax/minimax-m2.5) | +| `--base-url ` | LLM API base URL | +| `--api-key ` | LLM API key | +| `--concurrency ` | Parallel LLM calls (default: 3) | +| `--gist` | Publish wiki as a public GitHub Gist | + +### list — Show all indexed repos + +```bash +npx gitnexus list +``` + +Lists all repositories registered in `~/.gitnexus/registry.json`. The MCP `list_repos` tool provides the same information. + +## After Indexing + +1. **Read `gitnexus://repo/{name}/context`** to verify the index loaded +2. 
Use the other GitNexus skills (`exploring`, `debugging`, `impact-analysis`, `refactoring`) for your task + +## Troubleshooting + +- **"Not inside a git repository"**: Run from a directory inside a git repo +- **Index is stale after re-analyzing**: Restart Claude Code to reload the MCP server +- **Embeddings slow**: Omit `--embeddings` (it's off by default) or set `OPENAI_API_KEY` for faster API-based embedding diff --git a/.claude/skills/gitnexus/gitnexus-debugging/SKILL.md b/.claude/skills/gitnexus/gitnexus-debugging/SKILL.md new file mode 100644 index 0000000..9510b97 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-debugging/SKILL.md @@ -0,0 +1,89 @@ +--- +name: gitnexus-debugging +description: "Use when the user is debugging a bug, tracing an error, or asking why something fails. Examples: \"Why is X failing?\", \"Where does this error come from?\", \"Trace this bug\"" +--- + +# Debugging with GitNexus + +## When to Use + +- "Why is this function failing?" +- "Trace where this error comes from" +- "Who calls this method?" +- "This endpoint returns 500" +- Investigating bugs, errors, or unexpected behavior + +## Workflow + +``` +1. gitnexus_query({query: ""}) → Find related execution flows +2. gitnexus_context({name: ""}) → See callers/callees/processes +3. READ gitnexus://repo/{name}/process/{name} → Trace execution flow +4. gitnexus_cypher({query: "MATCH path..."}) → Custom traces if needed +``` + +> If "Index is stale" → run `npx gitnexus analyze` in terminal. 
+ +## Checklist + +``` +- [ ] Understand the symptom (error message, unexpected behavior) +- [ ] gitnexus_query for error text or related code +- [ ] Identify the suspect function from returned processes +- [ ] gitnexus_context to see callers and callees +- [ ] Trace execution flow via process resource if applicable +- [ ] gitnexus_cypher for custom call chain traces if needed +- [ ] Read source files to confirm root cause +``` + +## Debugging Patterns + +| Symptom | GitNexus Approach | +| -------------------- | ---------------------------------------------------------- | +| Error message | `gitnexus_query` for error text → `context` on throw sites | +| Wrong return value | `context` on the function → trace callees for data flow | +| Intermittent failure | `context` → look for external calls, async deps | +| Performance issue | `context` → find symbols with many callers (hot paths) | +| Recent regression | `detect_changes` to see what your changes affect | + +## Tools + +**gitnexus_query** — find code related to error: + +``` +gitnexus_query({query: "payment validation error"}) +→ Processes: CheckoutFlow, ErrorHandling +→ Symbols: validatePayment, handlePaymentError, PaymentException +``` + +**gitnexus_context** — full context for a suspect: + +``` +gitnexus_context({name: "validatePayment"}) +→ Incoming calls: processCheckout, webhookHandler +→ Outgoing calls: verifyCard, fetchRates (external API!) +→ Processes: CheckoutFlow (step 3/7) +``` + +**gitnexus_cypher** — custom call chain traces: + +```cypher +MATCH path = (a)-[:CodeRelation {type: 'CALLS'}*1..2]->(b:Function {name: "validatePayment"}) +RETURN [n IN nodes(path) | n.name] AS chain +``` + +## Example: "Payment endpoint returns 500 intermittently" + +``` +1. gitnexus_query({query: "payment error handling"}) + → Processes: CheckoutFlow, ErrorHandling + → Symbols: validatePayment, handlePaymentError + +2. gitnexus_context({name: "validatePayment"}) + → Outgoing calls: verifyCard, fetchRates (external API!) 
+ +3. READ gitnexus://repo/my-app/process/CheckoutFlow + → Step 3: validatePayment → calls fetchRates (external) + +4. Root cause: fetchRates calls external API without proper timeout +``` diff --git a/.claude/skills/gitnexus/gitnexus-exploring/SKILL.md b/.claude/skills/gitnexus/gitnexus-exploring/SKILL.md new file mode 100644 index 0000000..927a4e4 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-exploring/SKILL.md @@ -0,0 +1,78 @@ +--- +name: gitnexus-exploring +description: "Use when the user asks how code works, wants to understand architecture, trace execution flows, or explore unfamiliar parts of the codebase. Examples: \"How does X work?\", \"What calls this function?\", \"Show me the auth flow\"" +--- + +# Exploring Codebases with GitNexus + +## When to Use + +- "How does authentication work?" +- "What's the project structure?" +- "Show me the main components" +- "Where is the database logic?" +- Understanding code you haven't seen before + +## Workflow + +``` +1. READ gitnexus://repos → Discover indexed repos +2. READ gitnexus://repo/{name}/context → Codebase overview, check staleness +3. gitnexus_query({query: "<concept>"}) → Find related execution flows +4. gitnexus_context({name: "<symbol>"}) → Deep dive on specific symbol +5. READ gitnexus://repo/{name}/process/{name} → Trace full execution flow +``` + +> If step 2 says "Index is stale" → run `npx gitnexus analyze` in terminal. 
+ +## Checklist + +``` +- [ ] READ gitnexus://repo/{name}/context +- [ ] gitnexus_query for the concept you want to understand +- [ ] Review returned processes (execution flows) +- [ ] gitnexus_context on key symbols for callers/callees +- [ ] READ process resource for full execution traces +- [ ] Read source files for implementation details +``` + +## Resources + +| Resource | What you get | +| --------------------------------------- | ------------------------------------------------------- | +| `gitnexus://repo/{name}/context` | Stats, staleness warning (~150 tokens) | +| `gitnexus://repo/{name}/clusters` | All functional areas with cohesion scores (~300 tokens) | +| `gitnexus://repo/{name}/cluster/{name}` | Area members with file paths (~500 tokens) | +| `gitnexus://repo/{name}/process/{name}` | Step-by-step execution trace (~200 tokens) | + +## Tools + +**gitnexus_query** — find execution flows related to a concept: + +``` +gitnexus_query({query: "payment processing"}) +→ Processes: CheckoutFlow, RefundFlow, WebhookHandler +→ Symbols grouped by flow with file locations +``` + +**gitnexus_context** — 360-degree view of a symbol: + +``` +gitnexus_context({name: "validateUser"}) +→ Incoming calls: loginHandler, apiMiddleware +→ Outgoing calls: checkToken, getUserById +→ Processes: LoginFlow (step 2/5), TokenRefresh (step 1/3) +``` + +## Example: "How does payment processing work?" + +``` +1. READ gitnexus://repo/my-app/context → 918 symbols, 45 processes +2. gitnexus_query({query: "payment processing"}) + → CheckoutFlow: processPayment → validateCard → chargeStripe + → RefundFlow: initiateRefund → calculateRefund → processRefund +3. gitnexus_context({name: "processPayment"}) + → Incoming: checkoutHandler, webhookHandler + → Outgoing: validateCard, chargeStripe, saveTransaction +4. 
Read src/payments/processor.ts for implementation details +``` diff --git a/.claude/skills/gitnexus/gitnexus-guide/SKILL.md b/.claude/skills/gitnexus/gitnexus-guide/SKILL.md new file mode 100644 index 0000000..937ac73 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-guide/SKILL.md @@ -0,0 +1,64 @@ +--- +name: gitnexus-guide +description: "Use when the user asks about GitNexus itself — available tools, how to query the knowledge graph, MCP resources, graph schema, or workflow reference. Examples: \"What GitNexus tools are available?\", \"How do I use GitNexus?\"" +--- + +# GitNexus Guide + +Quick reference for all GitNexus MCP tools, resources, and the knowledge graph schema. + +## Always Start Here + +For any task involving code understanding, debugging, impact analysis, or refactoring: + +1. **Read `gitnexus://repo/{name}/context`** — codebase overview + check index freshness +2. **Match your task to a skill below** and **read that skill file** +3. **Follow the skill's workflow and checklist** + +> If step 1 warns the index is stale, run `npx gitnexus analyze` in the terminal first. + +## Skills + +| Task | Skill to read | +| -------------------------------------------- | ------------------- | +| Understand architecture / "How does X work?" | `gitnexus-exploring` | +| Blast radius / "What breaks if I change X?" | `gitnexus-impact-analysis` | +| Trace bugs / "Why is X failing?" 
| `gitnexus-debugging` | +| Rename / extract / split / refactor | `gitnexus-refactoring` | +| Tools, resources, schema reference | `gitnexus-guide` (this file) | +| Index, status, clean, wiki CLI commands | `gitnexus-cli` | + +## Tools Reference + +| Tool | What it gives you | +| ---------------- | ------------------------------------------------------------------------ | +| `query` | Process-grouped code intelligence — execution flows related to a concept | +| `context` | 360-degree symbol view — categorized refs, processes it participates in | +| `impact` | Symbol blast radius — what breaks at depth 1/2/3 with confidence | +| `detect_changes` | Git-diff impact — what do your current changes affect | +| `rename` | Multi-file coordinated rename with confidence-tagged edits | +| `cypher` | Raw graph queries (read `gitnexus://repo/{name}/schema` first) | +| `list_repos` | Discover indexed repos | + +## Resources Reference + +Lightweight reads (~100-500 tokens) for navigation: + +| Resource | Content | +| ---------------------------------------------- | ----------------------------------------- | +| `gitnexus://repo/{name}/context` | Stats, staleness check | +| `gitnexus://repo/{name}/clusters` | All functional areas with cohesion scores | +| `gitnexus://repo/{name}/cluster/{clusterName}` | Area members | +| `gitnexus://repo/{name}/processes` | All execution flows | +| `gitnexus://repo/{name}/process/{processName}` | Step-by-step trace | +| `gitnexus://repo/{name}/schema` | Graph schema for Cypher | + +## Graph Schema + +**Nodes:** File, Function, Class, Interface, Method, Community, Process +**Edges (via CodeRelation.type):** CALLS, IMPORTS, EXTENDS, IMPLEMENTS, DEFINES, MEMBER_OF, STEP_IN_PROCESS + +```cypher +MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(f:Function {name: "myFunc"}) +RETURN caller.name, caller.filePath +``` diff --git a/.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md b/.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md new file 
mode 100644 index 0000000..e19af28 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-impact-analysis/SKILL.md @@ -0,0 +1,97 @@ +--- +name: gitnexus-impact-analysis +description: "Use when the user wants to know what will break if they change something, or needs safety analysis before editing code. Examples: \"Is it safe to change X?\", \"What depends on this?\", \"What will break?\"" +--- + +# Impact Analysis with GitNexus + +## When to Use + +- "Is it safe to change this function?" +- "What will break if I modify X?" +- "Show me the blast radius" +- "Who uses this code?" +- Before making non-trivial code changes +- Before committing — to understand what your changes affect + +## Workflow + +``` +1. gitnexus_impact({target: "X", direction: "upstream"}) → What depends on this +2. READ gitnexus://repo/{name}/processes → Check affected execution flows +3. gitnexus_detect_changes() → Map current git changes to affected flows +4. Assess risk and report to user +``` + +> If "Index is stale" → run `npx gitnexus analyze` in terminal. 
+ +## Checklist + +``` +- [ ] gitnexus_impact({target, direction: "upstream"}) to find dependents +- [ ] Review d=1 items first (these WILL BREAK) +- [ ] Check high-confidence (>0.8) dependencies +- [ ] READ processes to check affected execution flows +- [ ] gitnexus_detect_changes() for pre-commit check +- [ ] Assess risk level and report to user +``` + +## Understanding Output + +| Depth | Risk Level | Meaning | +| ----- | ---------------- | ------------------------ | +| d=1 | **WILL BREAK** | Direct callers/importers | +| d=2 | LIKELY AFFECTED | Indirect dependencies | +| d=3 | MAY NEED TESTING | Transitive effects | + +## Risk Assessment + +| Affected | Risk | +| ------------------------------ | -------- | +| <5 symbols, few processes | LOW | +| 5-15 symbols, 2-5 processes | MEDIUM | +| >15 symbols or many processes | HIGH | +| Critical path (auth, payments) | CRITICAL | + +## Tools + +**gitnexus_impact** — the primary tool for symbol blast radius: + +``` +gitnexus_impact({ + target: "validateUser", + direction: "upstream", + minConfidence: 0.8, + maxDepth: 3 +}) + +→ d=1 (WILL BREAK): + - loginHandler (src/auth/login.ts:42) [CALLS, 100%] + - apiMiddleware (src/api/middleware.ts:15) [CALLS, 100%] + +→ d=2 (LIKELY AFFECTED): + - authRouter (src/routes/auth.ts:22) [CALLS, 95%] +``` + +**gitnexus_detect_changes** — git-diff based impact analysis: + +``` +gitnexus_detect_changes({scope: "staged"}) + +→ Changed: 5 symbols in 3 files +→ Affected: LoginFlow, TokenRefresh, APIMiddlewarePipeline +→ Risk: MEDIUM +``` + +## Example: "What breaks if I change validateUser?" + +``` +1. gitnexus_impact({target: "validateUser", direction: "upstream"}) + → d=1: loginHandler, apiMiddleware (WILL BREAK) + → d=2: authRouter, sessionManager (LIKELY AFFECTED) + +2. READ gitnexus://repo/my-app/processes + → LoginFlow and TokenRefresh touch validateUser + +3. 
Risk: 2 direct callers, 2 processes = MEDIUM +``` diff --git a/.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md b/.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md new file mode 100644 index 0000000..f48cc01 --- /dev/null +++ b/.claude/skills/gitnexus/gitnexus-refactoring/SKILL.md @@ -0,0 +1,121 @@ +--- +name: gitnexus-refactoring +description: "Use when the user wants to rename, extract, split, move, or restructure code safely. Examples: \"Rename this function\", \"Extract this into a module\", \"Refactor this class\", \"Move this to a separate file\"" +--- + +# Refactoring with GitNexus + +## When to Use + +- "Rename this function safely" +- "Extract this into a module" +- "Split this service" +- "Move this to a new file" +- Any task involving renaming, extracting, splitting, or restructuring code + +## Workflow + +``` +1. gitnexus_impact({target: "X", direction: "upstream"}) → Map all dependents +2. gitnexus_query({query: "X"}) → Find execution flows involving X +3. gitnexus_context({name: "X"}) → See all incoming/outgoing refs +4. Plan update order: interfaces → implementations → callers → tests +``` + +> If "Index is stale" → run `npx gitnexus analyze` in terminal. 
+ +## Checklists + +### Rename Symbol + +``` +- [ ] gitnexus_rename({symbol_name: "oldName", new_name: "newName", dry_run: true}) — preview all edits +- [ ] Review graph edits (high confidence) and ast_search edits (review carefully) +- [ ] If satisfied: gitnexus_rename({..., dry_run: false}) — apply edits +- [ ] gitnexus_detect_changes() — verify only expected files changed +- [ ] Run tests for affected processes +``` + +### Extract Module + +``` +- [ ] gitnexus_context({name: target}) — see all incoming/outgoing refs +- [ ] gitnexus_impact({target, direction: "upstream"}) — find all external callers +- [ ] Define new module interface +- [ ] Extract code, update imports +- [ ] gitnexus_detect_changes() — verify affected scope +- [ ] Run tests for affected processes +``` + +### Split Function/Service + +``` +- [ ] gitnexus_context({name: target}) — understand all callees +- [ ] Group callees by responsibility +- [ ] gitnexus_impact({target, direction: "upstream"}) — map callers to update +- [ ] Create new functions/services +- [ ] Update callers +- [ ] gitnexus_detect_changes() — verify affected scope +- [ ] Run tests for affected processes +``` + +## Tools + +**gitnexus_rename** — automated multi-file rename: + +``` +gitnexus_rename({symbol_name: "validateUser", new_name: "authenticateUser", dry_run: true}) +→ 12 edits across 8 files +→ 10 graph edits (high confidence), 2 ast_search edits (review) +→ Changes: [{file_path, edits: [{line, old_text, new_text, confidence}]}] +``` + +**gitnexus_impact** — map all dependents first: + +``` +gitnexus_impact({target: "validateUser", direction: "upstream"}) +→ d=1: loginHandler, apiMiddleware, testUtils +→ Affected Processes: LoginFlow, TokenRefresh +``` + +**gitnexus_detect_changes** — verify your changes after refactoring: + +``` +gitnexus_detect_changes({scope: "all"}) +→ Changed: 8 files, 12 symbols +→ Affected processes: LoginFlow, TokenRefresh +→ Risk: MEDIUM +``` + +**gitnexus_cypher** — custom reference queries: + 
+```cypher +MATCH (caller)-[:CodeRelation {type: 'CALLS'}]->(f:Function {name: "validateUser"}) +RETURN caller.name, caller.filePath ORDER BY caller.filePath +``` + +## Risk Rules + +| Risk Factor | Mitigation | +| ------------------- | ----------------------------------------- | +| Many callers (>5) | Use gitnexus_rename for automated updates | +| Cross-area refs | Use detect_changes after to verify scope | +| String/dynamic refs | gitnexus_query to find them | +| External/public API | Version and deprecate properly | + +## Example: Rename `validateUser` to `authenticateUser` + +``` +1. gitnexus_rename({symbol_name: "validateUser", new_name: "authenticateUser", dry_run: true}) + → 12 edits: 10 graph (safe), 2 ast_search (review) + → Files: validator.ts, login.ts, middleware.ts, config.json... + +2. Review ast_search edits (config.json: dynamic reference!) + +3. gitnexus_rename({symbol_name: "validateUser", new_name: "authenticateUser", dry_run: false}) + → Applied 12 edits across 8 files + +4. gitnexus_detect_changes({scope: "all"}) + → Affected: LoginFlow, TokenRefresh + → Risk: MEDIUM — run tests for these flows +``` diff --git a/.gitignore b/.gitignore index 65f3ffe..6379a8e 100644 --- a/.gitignore +++ b/.gitignore @@ -216,4 +216,4 @@ poetry.toml .vscode .import_linter_cache .pycharm_plugin - +.idea From d5dd2460c422accce43994910d5a285f0e7f10e3 Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 18:12:49 +0200 Subject: [PATCH 11/14] chore(infra): align env vars with ERSys naming convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames LOG_LEVEL → ERE_LOG_LEVEL to match the ERSys unified .env.example, allowing integration tests to run against the shared ERSys infrastructure without any compose-level variable mapping. 
Changes: - src/ere/utils/logging.py: read ERE_LOG_LEVEL instead of LOG_LEVEL - src/ere/entrypoints/app.py: update env var name in docstring - demo/demo.py: read ERE_LOG_LEVEL instead of LOG_LEVEL - test/unit/utils/test_logging.py: update env var references - infra/.env.example: new file, ERE-relevant subset of ERSys .env.example - infra/compose.dev.yaml: remove LOG_LEVEL mapping (no longer needed) --- demo/demo.py | 2 +- infra/.env.example | 19 ++++++++++++------- src/ere/entrypoints/app.py | 2 +- src/ere/utils/logging.py | 4 ++-- test/unit/utils/test_logging.py | 4 ++-- 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/demo/demo.py b/demo/demo.py index 1a06939..6bf2570 100755 --- a/demo/demo.py +++ b/demo/demo.py @@ -93,7 +93,7 @@ def load_env_file(env_path: str = None) -> dict: def setup_logging(): """Configure logging with timestamps.""" - log_level_name = os.environ.get("LOG_LEVEL", "INFO").upper() + log_level_name = os.environ.get("ERE_LOG_LEVEL", "INFO").upper() # Handle custom TRACE level if log_level_name == "TRACE": diff --git a/infra/.env.example b/infra/.env.example index 0057f84..80795f5 100644 --- a/infra/.env.example +++ b/infra/.env.example @@ -1,18 +1,23 @@ -# Copy this file to .env and customize as needed: -# cp infra/.env.example infra/.env +# ERE local development environment +# Copy to infra/.env and customise: cp infra/.env.example infra/.env +# +# Compatible with the ERSys unified environment (infra/.env.example). +# When running ERE standalone, use this file with infra/compose.dev.yaml. +# When running inside the full ERSys stack, the parent project's .env covers these. 
-# Redis +# --- Redis --- REDIS_HOST=redis REDIS_PORT=6379 REDIS_DB=0 REDIS_PASSWORD=changeme -# Queue names +# --- Queues --- REQUEST_QUEUE=ere_requests RESPONSE_QUEUE=ere_responses -# DuckDB (path inside container, volume-mounted) +# --- Storage --- DUCKDB_PATH=/data/app.duckdb -# Logging -LOG_LEVEL=INFO +# --- Logging --- +# ERSys uses ERE_LOG_LEVEL; compose.dev.yaml maps it to LOG_LEVEL internally. +ERE_LOG_LEVEL=INFO diff --git a/src/ere/entrypoints/app.py b/src/ere/entrypoints/app.py index e4077db..6a6fbd7 100644 --- a/src/ere/entrypoints/app.py +++ b/src/ere/entrypoints/app.py @@ -12,7 +12,7 @@ REDIS_HOST Redis hostname (default: localhost) REDIS_PORT Redis port (default: 6379) REDIS_DB Redis DB index (default: 0) - LOG_LEVEL Python log level name (default: INFO) — supports TRACE + ERE_LOG_LEVEL Python log level name (default: INFO) — supports TRACE RDF_MAPPING_PATH Path to rdf_mapping.yaml config file RESOLVER_CONFIG_PATH Path to resolver.yaml config file DUCKDB_PATH Path to persistent DuckDB file (overrides resolver.yaml) diff --git a/src/ere/utils/logging.py b/src/ere/utils/logging.py index 9100a1b..70e36a3 100644 --- a/src/ere/utils/logging.py +++ b/src/ere/utils/logging.py @@ -26,10 +26,10 @@ def configure_logging(log_level: str = None) -> None: Args: log_level: Log level name (e.g., 'DEBUG', 'INFO', 'TRACE'). - If None, reads from LOG_LEVEL environment variable (default: INFO). + If None, reads from ERE_LOG_LEVEL environment variable (default: INFO). 
""" if log_level is None: - log_level = os.environ.get("LOG_LEVEL", "INFO").upper() + log_level = os.environ.get("ERE_LOG_LEVEL", "INFO").upper() else: log_level = log_level.upper() diff --git a/test/unit/utils/test_logging.py b/test/unit/utils/test_logging.py index 455d1bd..b285a0d 100644 --- a/test/unit/utils/test_logging.py +++ b/test/unit/utils/test_logging.py @@ -23,14 +23,14 @@ def test_configure_logging_passes_trace_level_to_basicconfig(): def test_configure_logging_reads_env_var(monkeypatch): - monkeypatch.setenv("LOG_LEVEL", "ERROR") + monkeypatch.setenv("ERE_LOG_LEVEL", "ERROR") with patch("logging.basicConfig") as mock_bc: configure_logging() assert mock_bc.call_args[1]["level"] == logging.ERROR def test_configure_logging_defaults_to_info(monkeypatch): - monkeypatch.delenv("LOG_LEVEL", raising=False) + monkeypatch.delenv("ERE_LOG_LEVEL", raising=False) with patch("logging.basicConfig") as mock_bc: configure_logging() assert mock_bc.call_args[1]["level"] == logging.INFO From 19e787c1817d2cb74f443287a8f9872a3937651b Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 22:17:21 +0200 Subject: [PATCH 12/14] docs(agents): add ERE-specific agent operating instructions AGENTS.md (and its CLAUDE.md mirror) now contains ERE-specific guidance: commits/PR rules, dev workflow, make targets reference, architecture rules, memory conventions, and gotchas. Replaces GitNexus-only boilerplate. 
Also aligns dev tooling: - Makefile: test-integration depends on check-env; test target sources .env - infra/.env.example: REDIS_HOST defaults to localhost for standalone dev - test/e2e/test_app.py: replace walrus operator with explicit env default --- AGENTS.md | 219 +++++++++++++++++++++++++++++++++++++++++++ CLAUDE.md | 219 +++++++++++++++++++++++++++++++++++++++++++ Makefile | 8 +- infra/.env.example | 2 +- test/e2e/test_app.py | 3 +- 5 files changed, 444 insertions(+), 7 deletions(-) create mode 100644 AGENTS.md create mode 100644 CLAUDE.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..26b1f0f --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,219 @@ +# ERE — Agent Operating Instructions + +This file governs how AI agents operate in this repository. +It complements `CLAUDE.md` (which governs Claude Code specifically) and `.claude/CLAUDE.md` (project instructions). + +--- + +## Commits and PRs + +- **Never auto-commit** unless the user explicitly asks. +- **Never force-push** to `main` or `develop`. +- **Never add co-author lines**, tool names, or agent names to commit messages. +- Commit format: `type(scope): concise description` — e.g. `feat(adapters): add splink resolver factory`. +- Stage only files you modified: `git add `, never `git add -A` blindly. +- Before committing, run `make lint` and `make test-unit` to verify nothing is broken. +- PRs target `develop` (not `main`) unless told otherwise. +- When creating a PR, include a short summary and a test-plan checklist. + +--- + +## Working Methodology + +### Before touching code + +1. Read `WORKING.md` — it points to the active task file. +2. Read the referenced `docs/tasks/yyyy-mm-dd-*.md` fully. +3. Understand the current branch state: `git log --oneline -10`. + +### Running the stack for integration tests + +Integration tests require Redis to be running. 
Start it first: + +```bash +make infra-up # starts Redis + RedisInsight via Docker Compose +make test-integration # then run integration tests +make infra-down # tear down when done +``` + +Unit tests do **not** require any infrastructure: + +```bash +make test-unit # fast, self-contained, uses your venv +``` + +### Typical development loop + +```bash +make install # first time or after pyproject.toml changes +make test-unit # red → green → refactor +make lint # quick style check +make check-architecture # verify import-linter contracts +make all-quality-checks # before opening a PR +``` + +--- + +## Tooling Reference + +| Target | What it does | +|--------|-------------| +| `make install` | Install deps via Poetry | +| `make test-unit` | pytest unit suite + coverage report | +| `make test-integration` | integration tests (Redis must be up) | +| `make test-coverage` | HTML coverage report → `htmlcov/index.html` | +| `make lint` | pylint (fast, your venv) | +| `make format` | Ruff formatter | +| `make lint-fix` | Ruff auto-fix | +| `make check-clean-code` | pylint + radon + xenon (tox isolated) | +| `make check-architecture` | import-linter contracts (tox isolated) | +| `make all-quality-checks` | lint + clean-code + architecture | +| `make ci` | full tox pipeline (py312 + architecture + clean-code) | +| `make infra-up` | Start Redis stack (Docker Compose) | +| `make infra-down` | Stop Redis stack | +| `make infra-watch` | Live-reload mode (syncs `src/` and `config/`) | + +--- + +## Architecture Rules (enforced by import-linter) + +Dependency direction must never be violated: + +``` +entrypoints → services → models + ↘ + adapters → models +``` + +- `models/` — no I/O, no framework imports, no side effects. +- `adapters/` — infrastructure only; never calls `services/`. +- `services/` — orchestrates domain and adapters; never imports from `entrypoints/`. +- `entrypoints/` — parses input, calls services, formats output; no business logic. + +Violations block CI. 
Check with `make check-architecture` before opening a PR. + +--- + +## Memory Conventions + +Save to memory only what is non-obvious and persists across conversations: + +- Architectural decisions that aren't evident from the code (e.g. resolver factory registry pattern, DuckDB threading model). +- Design constraints explained by the user that aren't in comments or docs. +- User preferences about how to collaborate (e.g. "never suggest walrus operators", "prefer explicit factory injection"). + +Do **not** save to memory: +- Current task state (use the task file in `docs/tasks/`). +- Git history or recent changes (readable via `git log`). +- File paths or code structure (readable from the repo). + +--- + +## Gotchas + +- **`logging.basicConfig` is a no-op** when handlers already exist (conftest sets them up via `dictConfig`). Mock it with `patch("logging.basicConfig")` in logging tests. +- **DuckDB in tests**: use in-memory mode (`:memory:`) or a temp file via `tmp_path`; never a fixed path that leaks between tests. +- **Integration tests are marked** with `@pytest.mark.integration` — `make test-unit` skips them automatically. +- **`infra/.env`** is required for `make infra-*` targets. Copy from `infra/.env.example` on first use. +- **Config files** live in `config/` (repo root), not `infra/config/` — the `1cf319c` refactor moved them. +- **erspec models** are LinkML-generated with snake_case fields (e.g. `legal_name`, not `legalName`). Do not edit generated files — update the schema and regenerate. +- **`ERE_LOG_LEVEL`** is the canonical env var for log level in this service (not `LOG_LEVEL`). + +--- + + +# GitNexus — Code Intelligence + +This project is indexed by GitNexus as **entity-resolution-engine-basic** (528 symbols, 1372 relationships, 36 execution flows). Use the GitNexus MCP tools to understand code, assess impact, and navigate safely. + +> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first. 
+ +## Always Do + +- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user. +- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows. +- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits. +- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance. +- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`. + +## When Debugging + +1. `gitnexus_query({query: ""})` — find execution flows related to the issue +2. `gitnexus_context({name: ""})` — see all callers, callees, and process participation +3. `READ gitnexus://repo/entity-resolution-engine-basic/process/{processName}` — trace the full execution flow step by step +4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed + +## When Refactoring + +- **Renaming**: MUST use `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` first. Review the preview — graph edits are safe, text_search edits need manual review. Then run with `dry_run: false`. +- **Extracting/Splitting**: MUST run `gitnexus_context({name: "target"})` to see all incoming/outgoing refs, then `gitnexus_impact({target: "target", direction: "upstream"})` to find all external callers before moving code. +- After any refactor: run `gitnexus_detect_changes({scope: "all"})` to verify only expected files changed. + +## Never Do + +- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. 
+- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. +- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. +- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. + +## Tools Quick Reference + +| Tool | When to use | Command | +|------|-------------|---------| +| `query` | Find code by concept | `gitnexus_query({query: "auth validation"})` | +| `context` | 360-degree view of one symbol | `gitnexus_context({name: "validateUser"})` | +| `impact` | Blast radius before editing | `gitnexus_impact({target: "X", direction: "upstream"})` | +| `detect_changes` | Pre-commit scope check | `gitnexus_detect_changes({scope: "staged"})` | +| `rename` | Safe multi-file rename | `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` | +| `cypher` | Custom graph queries | `gitnexus_cypher({query: "MATCH ..."})` | + +## Impact Risk Levels + +| Depth | Meaning | Action | +|-------|---------|--------| +| d=1 | WILL BREAK — direct callers/importers | MUST update these | +| d=2 | LIKELY AFFECTED — indirect deps | Should test | +| d=3 | MAY NEED TESTING — transitive | Test if critical path | + +## Resources + +| Resource | Use for | +|----------|---------| +| `gitnexus://repo/entity-resolution-engine-basic/context` | Codebase overview, check index freshness | +| `gitnexus://repo/entity-resolution-engine-basic/clusters` | All functional areas | +| `gitnexus://repo/entity-resolution-engine-basic/processes` | All execution flows | +| `gitnexus://repo/entity-resolution-engine-basic/process/{name}` | Step-by-step execution trace | + +## Self-Check Before Finishing + +Before completing any code modification task, verify: +1. `gitnexus_impact` was run for all modified symbols +2. No HIGH/CRITICAL risk warnings were ignored +3. `gitnexus_detect_changes()` confirms changes match expected scope +4. 
All d=1 (WILL BREAK) dependents were updated + +## Keeping the Index Fresh + +After committing code changes, the GitNexus index becomes stale. Re-run analyze to update it: + +```bash +npx gitnexus analyze +``` + +If the index previously included embeddings, preserve them by adding `--embeddings`: + +```bash +npx gitnexus analyze --embeddings +``` + +To check whether embeddings exist, inspect `.gitnexus/meta.json` — the `stats.embeddings` field shows the count (0 means no embeddings). **Running analyze without `--embeddings` will delete any previously generated embeddings.** + +> Claude Code users: A PostToolUse hook handles this automatically after `git commit` and `git merge`. + +## CLI + +- Re-index: `npx gitnexus analyze` +- Check freshness: `npx gitnexus status` +- Generate docs: `npx gitnexus wiki` + + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..26b1f0f --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,219 @@ +# ERE — Agent Operating Instructions + +This file governs how AI agents operate in this repository. +It complements `CLAUDE.md` (which governs Claude Code specifically) and `.claude/CLAUDE.md` (project instructions). + +--- + +## Commits and PRs + +- **Never auto-commit** unless the user explicitly asks. +- **Never force-push** to `main` or `develop`. +- **Never add co-author lines**, tool names, or agent names to commit messages. +- Commit format: `type(scope): concise description` — e.g. `feat(adapters): add splink resolver factory`. +- Stage only files you modified: `git add `, never `git add -A` blindly. +- Before committing, run `make lint` and `make test-unit` to verify nothing is broken. +- PRs target `develop` (not `main`) unless told otherwise. +- When creating a PR, include a short summary and a test-plan checklist. + +--- + +## Working Methodology + +### Before touching code + +1. Read `WORKING.md` — it points to the active task file. +2. Read the referenced `docs/tasks/yyyy-mm-dd-*.md` fully. +3. 
Understand the current branch state: `git log --oneline -10`. + +### Running the stack for integration tests + +Integration tests require Redis to be running. Start it first: + +```bash +make infra-up # starts Redis + RedisInsight via Docker Compose +make test-integration # then run integration tests +make infra-down # tear down when done +``` + +Unit tests do **not** require any infrastructure: + +```bash +make test-unit # fast, self-contained, uses your venv +``` + +### Typical development loop + +```bash +make install # first time or after pyproject.toml changes +make test-unit # red → green → refactor +make lint # quick style check +make check-architecture # verify import-linter contracts +make all-quality-checks # before opening a PR +``` + +--- + +## Tooling Reference + +| Target | What it does | +|--------|-------------| +| `make install` | Install deps via Poetry | +| `make test-unit` | pytest unit suite + coverage report | +| `make test-integration` | integration tests (Redis must be up) | +| `make test-coverage` | HTML coverage report → `htmlcov/index.html` | +| `make lint` | pylint (fast, your venv) | +| `make format` | Ruff formatter | +| `make lint-fix` | Ruff auto-fix | +| `make check-clean-code` | pylint + radon + xenon (tox isolated) | +| `make check-architecture` | import-linter contracts (tox isolated) | +| `make all-quality-checks` | lint + clean-code + architecture | +| `make ci` | full tox pipeline (py312 + architecture + clean-code) | +| `make infra-up` | Start Redis stack (Docker Compose) | +| `make infra-down` | Stop Redis stack | +| `make infra-watch` | Live-reload mode (syncs `src/` and `config/`) | + +--- + +## Architecture Rules (enforced by import-linter) + +Dependency direction must never be violated: + +``` +entrypoints → services → models + ↘ + adapters → models +``` + +- `models/` — no I/O, no framework imports, no side effects. +- `adapters/` — infrastructure only; never calls `services/`. 
+- `services/` — orchestrates domain and adapters; never imports from `entrypoints/`. +- `entrypoints/` — parses input, calls services, formats output; no business logic. + +Violations block CI. Check with `make check-architecture` before opening a PR. + +--- + +## Memory Conventions + +Save to memory only what is non-obvious and persists across conversations: + +- Architectural decisions that aren't evident from the code (e.g. resolver factory registry pattern, DuckDB threading model). +- Design constraints explained by the user that aren't in comments or docs. +- User preferences about how to collaborate (e.g. "never suggest walrus operators", "prefer explicit factory injection"). + +Do **not** save to memory: +- Current task state (use the task file in `docs/tasks/`). +- Git history or recent changes (readable via `git log`). +- File paths or code structure (readable from the repo). + +--- + +## Gotchas + +- **`logging.basicConfig` is a no-op** when handlers already exist (conftest sets them up via `dictConfig`). Mock it with `patch("logging.basicConfig")` in logging tests. +- **DuckDB in tests**: use in-memory mode (`:memory:`) or a temp file via `tmp_path`; never a fixed path that leaks between tests. +- **Integration tests are marked** with `@pytest.mark.integration` — `make test-unit` skips them automatically. +- **`infra/.env`** is required for `make infra-*` targets. Copy from `infra/.env.example` on first use. +- **Config files** live in `config/` (repo root), not `infra/config/` — the `1cf319c` refactor moved them. +- **erspec models** are LinkML-generated with snake_case fields (e.g. `legal_name`, not `legalName`). Do not edit generated files — update the schema and regenerate. +- **`ERE_LOG_LEVEL`** is the canonical env var for log level in this service (not `LOG_LEVEL`). + +--- + + +# GitNexus — Code Intelligence + +This project is indexed by GitNexus as **entity-resolution-engine-basic** (528 symbols, 1372 relationships, 36 execution flows). 
Use the GitNexus MCP tools to understand code, assess impact, and navigate safely.
+
+> If any GitNexus tool warns the index is stale, run `npx gitnexus analyze` in terminal first.
+
+## Always Do
+
+- **MUST run impact analysis before editing any symbol.** Before modifying a function, class, or method, run `gitnexus_impact({target: "symbolName", direction: "upstream"})` and report the blast radius (direct callers, affected processes, risk level) to the user.
+- **MUST run `gitnexus_detect_changes()` before committing** to verify your changes only affect expected symbols and execution flows.
+- **MUST warn the user** if impact analysis returns HIGH or CRITICAL risk before proceeding with edits.
+- When exploring unfamiliar code, use `gitnexus_query({query: "concept"})` to find execution flows instead of grepping. It returns process-grouped results ranked by relevance.
+- When you need full context on a specific symbol — callers, callees, which execution flows it participates in — use `gitnexus_context({name: "symbolName"})`.
+
+## When Debugging
+
+1. `gitnexus_query({query: "<symptom or concept>"})` — find execution flows related to the issue
+2. `gitnexus_context({name: "<suspect symbol>"})` — see all callers, callees, and process participation
+3. `READ gitnexus://repo/entity-resolution-engine-basic/process/{processName}` — trace the full execution flow step by step
+4. For regressions: `gitnexus_detect_changes({scope: "compare", base_ref: "main"})` — see what your branch changed
+
+## When Refactoring
+
+- **Renaming**: MUST use `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` first. Review the preview — graph edits are safe, text_search edits need manual review. Then run with `dry_run: false`.
+- **Extracting/Splitting**: MUST run `gitnexus_context({name: "target"})` to see all incoming/outgoing refs, then `gitnexus_impact({target: "target", direction: "upstream"})` to find all external callers before moving code. 
+- After any refactor: run `gitnexus_detect_changes({scope: "all"})` to verify only expected files changed. + +## Never Do + +- NEVER edit a function, class, or method without first running `gitnexus_impact` on it. +- NEVER ignore HIGH or CRITICAL risk warnings from impact analysis. +- NEVER rename symbols with find-and-replace — use `gitnexus_rename` which understands the call graph. +- NEVER commit changes without running `gitnexus_detect_changes()` to check affected scope. + +## Tools Quick Reference + +| Tool | When to use | Command | +|------|-------------|---------| +| `query` | Find code by concept | `gitnexus_query({query: "auth validation"})` | +| `context` | 360-degree view of one symbol | `gitnexus_context({name: "validateUser"})` | +| `impact` | Blast radius before editing | `gitnexus_impact({target: "X", direction: "upstream"})` | +| `detect_changes` | Pre-commit scope check | `gitnexus_detect_changes({scope: "staged"})` | +| `rename` | Safe multi-file rename | `gitnexus_rename({symbol_name: "old", new_name: "new", dry_run: true})` | +| `cypher` | Custom graph queries | `gitnexus_cypher({query: "MATCH ..."})` | + +## Impact Risk Levels + +| Depth | Meaning | Action | +|-------|---------|--------| +| d=1 | WILL BREAK — direct callers/importers | MUST update these | +| d=2 | LIKELY AFFECTED — indirect deps | Should test | +| d=3 | MAY NEED TESTING — transitive | Test if critical path | + +## Resources + +| Resource | Use for | +|----------|---------| +| `gitnexus://repo/entity-resolution-engine-basic/context` | Codebase overview, check index freshness | +| `gitnexus://repo/entity-resolution-engine-basic/clusters` | All functional areas | +| `gitnexus://repo/entity-resolution-engine-basic/processes` | All execution flows | +| `gitnexus://repo/entity-resolution-engine-basic/process/{name}` | Step-by-step execution trace | + +## Self-Check Before Finishing + +Before completing any code modification task, verify: +1. 
`gitnexus_impact` was run for all modified symbols +2. No HIGH/CRITICAL risk warnings were ignored +3. `gitnexus_detect_changes()` confirms changes match expected scope +4. All d=1 (WILL BREAK) dependents were updated + +## Keeping the Index Fresh + +After committing code changes, the GitNexus index becomes stale. Re-run analyze to update it: + +```bash +npx gitnexus analyze +``` + +If the index previously included embeddings, preserve them by adding `--embeddings`: + +```bash +npx gitnexus analyze --embeddings +``` + +To check whether embeddings exist, inspect `.gitnexus/meta.json` — the `stats.embeddings` field shows the count (0 means no embeddings). **Running analyze without `--embeddings` will delete any previously generated embeddings.** + +> Claude Code users: A PostToolUse hook handles this automatically after `git commit` and `git merge`. + +## CLI + +- Re-index: `npx gitnexus analyze` +- Check freshness: `npx gitnexus status` +- Generate docs: `npx gitnexus wiki` + + diff --git a/Makefile b/Makefile index 933f068..d62d251 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,7 @@ build: ## Build the package distribution .PHONY: test test-unit test-integration test-coverage test: ## Run all tests @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Running all tests$(END_BUILD_PRINT)" - @ poetry run pytest $(TEST_PATH) + @ set -a && . 
$(ENV_FILE) && set +a && poetry run pytest $(TEST_PATH) @ echo -e "$(BUILD_PRINT)$(ICON_DONE) All tests passed$(END_BUILD_PRINT)" test-unit: ## Run unit tests with coverage (fast, uses your venv) @@ -112,14 +112,14 @@ test-unit: ## Run unit tests with coverage (fast, uses your venv) --cov=src --cov-report=term-missing --cov-report=html @ echo -e "$(BUILD_PRINT)$(ICON_DONE) Unit tests passed (coverage: htmlcov/index.html)$(END_BUILD_PRINT)" -test-integration: ## Run integration tests only +test-integration: check-env ## Run integration tests only (requires Redis — run make infra-up first) @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Running integration tests$(END_BUILD_PRINT)" - @ poetry run pytest $(TEST_PATH) -m "integration" + @ set -a && . $(ENV_FILE) && set +a && poetry run pytest $(TEST_PATH) -m "integration" @ echo -e "$(BUILD_PRINT)$(ICON_DONE) Integration tests passed$(END_BUILD_PRINT)" test-coverage: ## Generate detailed HTML coverage report @ echo -e "$(BUILD_PRINT)$(ICON_PROGRESS) Generating coverage report$(END_BUILD_PRINT)" - @ poetry run pytest $(TEST_PATH) -m "not integration" \ + @ set -a && . $(ENV_FILE) && set +a && poetry run pytest $(TEST_PATH) -m "not integration" \ --cov=src --cov-report=html --cov-report=term-missing @ echo -e "$(BUILD_PRINT)$(ICON_DONE) Coverage report: htmlcov/index.html$(END_BUILD_PRINT)" diff --git a/infra/.env.example b/infra/.env.example index 80795f5..e6d0f0c 100644 --- a/infra/.env.example +++ b/infra/.env.example @@ -6,7 +6,7 @@ # When running inside the full ERSys stack, the parent project's .env covers these. 
# --- Redis --- -REDIS_HOST=redis +REDIS_HOST=localhost REDIS_PORT=6379 REDIS_DB=0 REDIS_PASSWORD=changeme diff --git a/test/e2e/test_app.py b/test/e2e/test_app.py index b15194e..f13ef18 100644 --- a/test/e2e/test_app.py +++ b/test/e2e/test_app.py @@ -42,8 +42,7 @@ def test_app_main_processes_single_request( monkeypatch.setenv("REDIS_HOST", os.environ.get("REDIS_HOST", "localhost")) monkeypatch.setenv("REDIS_PORT", os.environ.get("REDIS_PORT", "6379")) monkeypatch.setenv("REDIS_DB", os.environ.get("REDIS_DB", "0")) - if redis_password := os.environ.get("REDIS_PASSWORD"): - monkeypatch.setenv("REDIS_PASSWORD", redis_password) + monkeypatch.setenv("REDIS_PASSWORD", os.environ.get("REDIS_PASSWORD", "changeme")) monkeypatch.setenv("REQUEST_QUEUE", req_queue) monkeypatch.setenv("RESPONSE_QUEUE", resp_queue) monkeypatch.setenv("RESOLVER_CONFIG_PATH", str(resolver_config_path)) From 0daca1dc6136fb0f86dc5f23c7bf83e8a40baff4 Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 22:17:33 +0200 Subject: [PATCH 13/14] updated project setup --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4e5579..adc2487 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [project] -name = "ere" +name = "ere-basic" version = "0.1.0" description = "A basic implementation of the Entity Resolution Engine (ERE)." authors = [ From e5b1440cfbda2e4fc6f8ccf3fded7e40d98a2b0b Mon Sep 17 00:00:00 2001 From: Eugeniu Costetchi Date: Thu, 2 Apr 2026 22:18:19 +0200 Subject: [PATCH 14/14] updated project setup --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index adc2487..15c2bc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "ere-basic" -version = "0.1.0" +version = "0.4.0" description = "A basic implementation of the Entity Resolution Engine (ERE)." 
authors = [ {name = "Meaningfy",email = "hi@meaningfy.ws"}