diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index b7760aeb..54862e9f 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -87,3 +87,7 @@ jobs: - name: Unit Test run: | poetry run poe test_unit + + - name: Verb Test + run: | + poetry run poe test_verbs \ No newline at end of file diff --git a/.github/workflows/python-smoke-tests.yml b/.github/workflows/python-smoke-tests.yml index 56dff994..e1c41af2 100644 --- a/.github/workflows/python-smoke-tests.yml +++ b/.github/workflows/python-smoke-tests.yml @@ -47,17 +47,13 @@ jobs: GRAPHRAG_API_VERSION: ${{ secrets.GRAPHRAG_API_VERSION }} GRAPHRAG_LLM_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_LLM_DEPLOYMENT_NAME }} GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME: ${{ secrets.GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME }} - GRAPHRAG_CACHE_CONTAINER_NAME: "cicache" - GRAPHRAG_CACHE_BASE_DIR": "cache" GRAPHRAG_LLM_MODEL: ${{ secrets.GRAPHRAG_LLM_MODEL }} GRAPHRAG_EMBEDDING_MODEL: ${{ secrets.GRAPHRAG_EMBEDDING_MODEL }} - # We have Windows + Linux runners in 3.10 and 3.11, so we need to divide the rate limits by 4 - GRAPHRAG_LLM_TPM: 100_000 # 400_000 / 4 - GRAPHRAG_LLM_RPM: 500 # 2_000 / 4 - GRAPHRAG_EMBEDDING_TPM: 87_500 # 350,000 / 4 - GRAPHRAG_EMBEDDING_RPM: 525 # 2,100 / 4 - GRAPHRAG_CHUNK_SIZE: 1200 - GRAPHRAG_CHUNK_OVERLAP: 0 + # We have Windows + Linux runners in 3.10, so we need to divide the rate limits by 2 + GRAPHRAG_LLM_TPM: 200_000 # 400_000 / 2 + GRAPHRAG_LLM_RPM: 1_000 # 2_000 / 2 + GRAPHRAG_EMBEDDING_TPM: 225_000 # 450_000 / 2 + GRAPHRAG_EMBEDDING_RPM: 1_000 # 2_000 / 2 # Azure AI Search config AZURE_AI_SEARCH_URL_ENDPOINT: ${{ secrets.AZURE_AI_SEARCH_URL_ENDPOINT }} AZURE_AI_SEARCH_API_KEY: ${{ secrets.AZURE_AI_SEARCH_API_KEY }} @@ -101,10 +97,6 @@ jobs: run: | poetry build - - name: Verb Test - run: | - poetry run poe test_verbs - - name: Install Azurite id: azuright uses: potatoqualitee/azuright@v1.1 @@ -118,4 +110,4 @@ jobs: if: always() with: name: smoke-test-artifacts-${{ matrix.python-version }}-${{ matrix.poetry-version }}-${{ runner.os }} - path: tests/fixtures/*/output + path: tests/fixtures/* diff --git a/tests/fixtures/min-csv/config.json b/tests/fixtures/min-csv/config.json index 360eb76b..de496593 100644 --- a/tests/fixtures/min-csv/config.json +++ b/tests/fixtures/min-csv/config.json @@ -3,41 +3,38 @@ "input_file_type": "text", "workflow_config": { "create_base_text_units": { - "row_range": [ - 1, - 2500 - ], - "max_runtime": 150, - "expected_artifacts": 0 + "max_runtime": 30 }, "extract_graph": { - "row_range": [ - 1, - 2500 - ], - "max_runtime": 500, - "expected_artifacts": 0 + "max_runtime": 500 }, "finalize_graph": { "row_range": [ 1, - 2500 + 500 + ], + "nan_allowed_columns": [ + "x", + "y" ], "max_runtime": 30, - "expected_artifacts": 2 + "expected_artifacts": [ + "entities.parquet", + "relationships.parquet" + ] }, "create_communities": { "row_range": [ - 1, - 2500 + 10, + 30 ], - "max_runtime": 150, - "expected_artifacts": 1 + "max_runtime": 30, + "expected_artifacts": ["communities.parquet"] }, "create_community_reports": { "row_range": [ - 1, - 2500 + 10, + 30 ], "nan_allowed_columns": [ "title", @@ -51,35 +48,43 @@ "size" ], "max_runtime": 300, - "expected_artifacts": 1 + "expected_artifacts": ["community_reports.parquet"] }, "create_final_text_units": { "row_range": [ - 1, - 2500 + 10, + 50 ], "nan_allowed_columns": [ "relationship_ids", - "entity_ids" + "entity_ids", + "covariate_ids" ], - "max_runtime": 150, - "expected_artifacts": 1 + "max_runtime": 30, + "expected_artifacts": ["text_units.parquet"] }, "create_final_documents": { "row_range": [ - 1, - 2500 + 15, + 15 ], - "max_runtime": 150, - "expected_artifacts": 1 + "nan_allowed_columns": [ + "metadata" + ], + "max_runtime": 30, + "expected_artifacts": ["documents.parquet"] }, "generate_text_embeddings": { "row_range": [ 1, - 2500 + 500 ], "max_runtime": 150, - "expected_artifacts": 1 + "expected_artifacts": [ + "embeddings.text_unit.text.parquet", + "embeddings.entity.description.parquet", + "embeddings.community.full_content.parquet" + ] } }, "query_config": [ diff --git a/tests/fixtures/min-csv/settings.yml b/tests/fixtures/min-csv/settings.yml index f1877bc2..093ce6b1 100644 --- a/tests/fixtures/min-csv/settings.yml +++ b/tests/fixtures/min-csv/settings.yml @@ -41,5 +41,5 @@ snapshots: drift_search: n_depth: 1 - k_follow_ups: 3 + drift_k_followups: 3 primer_folds: 3 \ No newline at end of file diff --git a/tests/fixtures/text/config.json b/tests/fixtures/text/config.json index 17b9228f..2b983efc 100644 --- a/tests/fixtures/text/config.json +++ b/tests/fixtures/text/config.json @@ -3,33 +3,30 @@ "input_file_type": "text", "workflow_config": { "create_base_text_units": { - "row_range": [ - 1, - 2500 - ], - "max_runtime": 150, - "expected_artifacts": 0 + "max_runtime": 30 }, "extract_graph": { - "row_range": [ - 1, - 2500 - ], - "max_runtime": 500, - "expected_artifacts": 0 + "max_runtime": 500 }, "finalize_graph": { "row_range": [ 1, - 2500 + 100 + ], + "nan_allowed_columns": [ + "x", + "y" ], "max_runtime": 30, - "expected_artifacts": 2 + "expected_artifacts": [ + "entities.parquet", + "relationships.parquet" + ] }, "extract_covariates": { "row_range": [ 1, - 2500 + 100 ], "nan_allowed_columns": [ "type", @@ -41,20 +38,20 @@ "source_text" ], "max_runtime": 300, - "expected_artifacts": 1 + "expected_artifacts": ["covariates.parquet"] }, "create_communities": { "row_range": [ 1, - 2500 + 30 ], - "max_runtime": 150, - "expected_artifacts": 1 + "max_runtime": 30, + "expected_artifacts": ["communities.parquet"] }, "create_community_reports": { "row_range": [ 1, - 2500 + 30 ], "nan_allowed_columns": [ "title", @@ -68,35 +65,43 @@ "size" ], "max_runtime": 300, - "expected_artifacts": 1 + "expected_artifacts": ["community_reports.parquet"] }, "create_final_text_units": { "row_range": [ 1, - 2500 + 10 ], "nan_allowed_columns": [ "relationship_ids", - "entity_ids" + "entity_ids", + "covariate_ids" ], - "max_runtime": 150, - "expected_artifacts": 1 + "max_runtime": 30, + "expected_artifacts": ["text_units.parquet"] }, "create_final_documents": { "row_range": [ 1, - 2500 + 1 ], - "max_runtime": 150, - "expected_artifacts": 1 + "nan_allowed_columns": [ + "metadata" + ], + "max_runtime": 30, + "expected_artifacts": ["documents.parquet"] }, "generate_text_embeddings": { "row_range": [ 1, - 2500 + 100 ], "max_runtime": 150, - "expected_artifacts": 1 + "expected_artifacts": [ + "embeddings.text_unit.text.parquet", + "embeddings.entity.description.parquet", + "embeddings.community.full_content.parquet" + ] } }, "query_config": [ diff --git a/tests/fixtures/text/settings.yml b/tests/fixtures/text/settings.yml index c5c4fe74..6a5f8135 100644 --- a/tests/fixtures/text/settings.yml +++ b/tests/fixtures/text/settings.yml @@ -45,5 +45,5 @@ snapshots: drift_search: n_depth: 1 - k_follow_ups: 3 + drift_k_followups: 3 primer_folds: 3 diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py index c90f15a2..39e5e494 100644 --- a/tests/smoke/test_fixtures.py +++ b/tests/smoke/test_fixtures.py @@ -160,7 +160,6 @@ class TestIndexer: stats = json.loads((output_path / "stats.json").read_bytes().decode("utf-8")) # Check all workflows run - expected_artifacts = 0 expected_workflows = set(workflow_config.keys()) workflows = set(stats["workflows"].keys()) assert workflows == expected_workflows, ( @@ -168,56 +167,38 @@ class TestIndexer: ) # [OPTIONAL] Check runtime - for workflow in expected_workflows: + for workflow, config in workflow_config.items(): # Check expected artifacts - expected_artifacts = expected_artifacts + workflow_config[workflow].get( - "expected_artifacts", 1 - ) + workflow_artifacts = config.get("expected_artifacts", []) # Check max runtime - max_runtime = workflow_config[workflow].get("max_runtime", None) + max_runtime = config.get("max_runtime", None) if max_runtime: assert stats["workflows"][workflow]["overall"] <= max_runtime, ( f"Expected max runtime of {max_runtime}, found: {stats['workflows'][workflow]['overall']} for workflow: {workflow}" ) - - # Check artifacts - artifact_files = os.listdir(output_path) - - # check that the number of workflows matches the number of artifacts - assert len(artifact_files) == (expected_artifacts + 3), ( - f"Expected {expected_artifacts + 3} artifacts, found: {len(artifact_files)}" - ) # Embeddings add to the count - - for artifact in artifact_files: - if artifact.endswith(".parquet"): - output_df = pd.read_parquet(output_path / artifact) - artifact_name = artifact.split(".")[0] - - try: - workflow = workflow_config[artifact_name] + # Check expected artifacts + for artifact in workflow_artifacts: + if artifact.endswith(".parquet"): + output_df = pd.read_parquet(output_path / artifact) # Check number of rows between range assert ( - workflow["row_range"][0] + config["row_range"][0] <= len(output_df) - <= workflow["row_range"][1] + <= config["row_range"][1] ), ( - f"Expected between {workflow['row_range'][0]} and {workflow['row_range'][1]}, found: {len(output_df)} for file: {artifact}" + f"Expected between {config['row_range'][0]} and {config['row_range'][1]}, found: {len(output_df)} for file: {artifact}" ) # Get non-nan rows nan_df = output_df.loc[ :, - ~output_df.columns.isin( - workflow.get("nan_allowed_columns", []) - ), + ~output_df.columns.isin(config.get("nan_allowed_columns", [])), ] nan_df = nan_df[nan_df.isna().any(axis=1)] assert len(nan_df) == 0, ( f"Found {len(nan_df)} rows with NaN values for file: {artifact} on columns: {nan_df.columns[nan_df.isna().any()].tolist()}" ) - except KeyError: - log.warning("No workflow config found %s", artifact_name) def __run_query(self, root: Path, query_config: dict[str, str]): command = [