[
{
"discussion_id": "2281898593",
"pr_number": 36585,
"pr_file": "dags/max_ai/snapshot_project_data.py",
"created_at": "2025-08-18T09:52:02+00:00",
"commented_code": "+from collections.abc import Callable, Iterable, Iterator\n+from itertools import islice\n+from typing import TypeVar\n+\n+import dagster\n+from dagster_aws.s3.resources import S3Resource\n+from tenacity import (\n+ retry,\n+ retry_if_exception_type,\n+ stop_after_attempt,\n+ wait_exponential,\n+)\n+\n+from dags.common import JobOwners\n+from dags.max_ai.utils import (\n+ check_dump_exists,\n+ compose_clickhouse_dump_path,\n+ compose_postgres_dump_path,\n+ dump_model,\n+)\n+from ee.hogai.eval.schema import (\n+ ActorsPropertyTaxonomySnapshot,\n+ BaseSnapshot,\n+ ClickhouseProjectDataSnapshot,\n+ DataWarehouseTableSnapshot,\n+ GroupTypeMappingSnapshot,\n+ PostgresProjectDataSnapshot,\n+ PropertyDefinitionSnapshot,\n+ PropertyTaxonomySnapshot,\n+ TeamSnapshot,\n+ TeamTaxonomyItemSnapshot,\n+)\n+from posthog.errors import InternalCHQueryError\n+from posthog.hogql_queries.ai.actors_property_taxonomy_query_runner import (\n+ ActorsPropertyTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.event_taxonomy_query_runner import (\n+ EventTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.team_taxonomy_query_runner import TeamTaxonomyQueryRunner\n+from posthog.models import GroupTypeMapping, Team\n+from posthog.models.property_definition import PropertyDefinition\n+from posthog.schema import (\n+ ActorsPropertyTaxonomyQuery,\n+ EventTaxonomyQuery,\n+ TeamTaxonomyItem,\n+ TeamTaxonomyQuery,\n+)\n+\n+DEFAULT_RETRY_POLICY = dagster.RetryPolicy(\n+ max_retries=4,\n+ delay=2, # 2 seconds\n+ backoff=dagster.Backoff.EXPONENTIAL,\n+ jitter=dagster.Jitter.PLUS_MINUS,\n+)\n+\n+\n+SchemaBound = TypeVar(\"SchemaBound\", bound=BaseSnapshot)\n+\n+\n+def snapshot_postgres_model(\n+ context: dagster.OpExecutionContext,\n+ model_type: type[SchemaBound],\n+ file_name: str,\n+ s3: S3Resource,\n+ project_id: int,\n+ code_version: str | None = None,\n+) -> str:\n+ file_key = compose_postgres_dump_path(project_id, file_name, code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping {file_key} because it already exists\")\n+ return file_key\n+ context.log.info(f\"Dumping {file_key}\")\n+ with dump_model(s3=s3, schema=model_type, file_key=file_key) as dump:\n+ dump(model_type.serialize_for_project(project_id))\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots Postgres project data (property definitions, DWH schema, etc.)\",\n+ retry_policy=DEFAULT_RETRY_POLICY,\n+ code_version=\"v1\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+)\n+def snapshot_postgres_project_data(\n+ context: dagster.OpExecutionContext, project_id: int, s3: S3Resource\n+) -> PostgresProjectDataSnapshot:\n+ context.log.info(f\"Snapshotting Postgres project data for {project_id}\")\n+ snapshot_map: dict[str, type[BaseSnapshot]] = {\n+ \"project\": TeamSnapshot,\n+ \"property_definitions\": PropertyDefinitionSnapshot,\n+ \"group_type_mappings\": GroupTypeMappingSnapshot,\n+ \"data_warehouse_tables\": DataWarehouseTableSnapshot,\n+ }\n+ deps = {\n+ file_name: snapshot_postgres_model(context, model_type, file_name, s3, project_id, context.op_def.version)\n+ for file_name, model_type in snapshot_map.items()\n+ }\n+ context.log_event(\n+ dagster.AssetMaterialization(\n+ asset_key=\"project_postgres_snapshot\",\n+ description=\"Avro snapshots of project Postgres data\",\n+ metadata={\"project_id\": project_id, **deps},\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ )\n+ )\n+ return PostgresProjectDataSnapshot(**deps)\n+\n+\n+C = 
TypeVar(\"C\")\n+\n+\n+@retry(retry=retry_if_exception_type(InternalCHQueryError), stop=stop_after_attempt(4), wait=wait_exponential(min=8))\n+def call_query_runner(callable: Callable[[], C]) -> C:\n+ return callable()\n+\n+\n+def snapshot_properties_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, file_key: str, team: Team, events: list[TeamTaxonomyItem]\n+):\n+ results: list[PropertyTaxonomySnapshot] = []\n+\n+ def snapshot_event(item: TeamTaxonomyItem):\n+ return call_query_runner(\n+ lambda: EventTaxonomyQueryRunner(\n+ query=EventTaxonomyQuery(event=item.event),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ for item in events:\n+ context.log.info(f\"Snapshotting properties taxonomy for event {item.event} of {team.id}\")\n+ results.append(PropertyTaxonomySnapshot(event=item.event, results=snapshot_event(item).results))\n+\n+ context.log.info(f\"Dumping properties taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=PropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+\n+\n+def snapshot_events_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ # Check if files are cached\n+ events_file_key = compose_clickhouse_dump_path(team.id, \"events_taxonomy\", code_version=code_version)\n+ properties_file_key = compose_clickhouse_dump_path(team.id, \"properties_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, events_file_key) and check_dump_exists(s3, properties_file_key):\n+ context.log.info(f\"Skipping events and properties taxonomy snapshot for {team.id} because it already exists\")\n+ return events_file_key, properties_file_key\n+\n+ context.log.info(f\"Snapshotting events taxonomy for {team.id}\")\n+\n+ res = call_query_runner(lambda: TeamTaxonomyQueryRunner(query=TeamTaxonomyQuery(), team=team).calculate())\n+ if not res.results:\n+ raise ValueError(\"No results from events taxonomy query\")\n+\n+ # Dump properties\n+ snapshot_properties_taxonomy(context, s3, properties_file_key, team, res.results)\n+\n+ # Dump later to ensure caching\n+ with dump_model(s3=s3, schema=TeamTaxonomyItemSnapshot, file_key=events_file_key) as dump:\n+ dumped_items = TeamTaxonomyItemSnapshot(results=res.results)\n+ dump([dumped_items])\n+\n+ return events_file_key, properties_file_key\n+\n+\n+T = TypeVar(\"T\")\n+\n+\n+def chunked(iterable: Iterable[T], size: int = 200) -> Iterator[list[T]]:\n+ it = iter(iterable)\n+ while True:\n+ batch = list(islice(it, size))\n+ if not batch:\n+ break\n+ yield batch\n+\n+\n+def snapshot_actors_property_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ file_key = compose_clickhouse_dump_path(team.id, \"actors_property_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping actors property taxonomy snapshot for {team.id} because it already exists\")\n+ return file_key\n+\n+ # Snapshot all group type mappings and person\n+ results: list[ActorsPropertyTaxonomySnapshot] = []\n+ group_type_mappings: list[int | None] = [\n+ None,\n+ *(g.group_type_index for g in GroupTypeMapping.objects.filter(team=team)),\n+ ]\n+\n+ for index in group_type_mappings:\n+ is_group = index is not None\n+ log_entity = f\"group type {index}\" if is_group else \"persons\"\n+ context.log.info(f\"Snapshotting properties taxonomy for {log_entity}\")\n+\n+ # Retrieve saved property definitions for the group type or person\n+ property_defs = (\n+ 
PropertyDefinition.objects.filter(\n+ team=team,\n+ type=PropertyDefinition.Type.GROUP if is_group else PropertyDefinition.Type.PERSON,\n+ group_type_index=index,\n+ )\n+ .values_list(\"name\", flat=True)\n+ .iterator(chunk_size=200)\n+ )\n+\n+ # Query ClickHouse in batches of 200 properties\n+ for batch in chunked(property_defs, 200):\n+\n+ def snapshot(index: int | None, batch: list[str]):\n+ return call_query_runner(\n+ lambda: ActorsPropertyTaxonomyQueryRunner(\n+ query=ActorsPropertyTaxonomyQuery(groupTypeIndex=index, properties=batch, maxPropertyValues=25),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ res = snapshot(index, batch)\n+\n+ if not res.results:\n+ raise ValueError(\n+ f\"No results from actors property taxonomy query for group type {index} and properties {batch}\"\n+ )\n+\n+ # Snapshot queries in the same way as the toolkit expects\n+ for prop, prop_results in zip(batch, res.results):\n+ results.append(\n+ ActorsPropertyTaxonomySnapshot(property=prop, group_type_index=index, results=prop_results)\n+ )\n+\n+ context.log.info(f\"Dumping actors property taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=ActorsPropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots ClickHouse project data\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ code_version=\"v1\",\n+)\n+def snapshot_clickhouse_project_data(",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2281898593",
"repo_full_name": "PostHog/posthog",
"pr_number": 36585,
"pr_file": "dags/max_ai/snapshot_project_data.py",
"discussion_id": "2281898593",
"commented_code": "@@ -0,0 +1,273 @@\n+from collections.abc import Callable, Iterable, Iterator\n+from itertools import islice\n+from typing import TypeVar\n+\n+import dagster\n+from dagster_aws.s3.resources import S3Resource\n+from tenacity import (\n+ retry,\n+ retry_if_exception_type,\n+ stop_after_attempt,\n+ wait_exponential,\n+)\n+\n+from dags.common import JobOwners\n+from dags.max_ai.utils import (\n+ check_dump_exists,\n+ compose_clickhouse_dump_path,\n+ compose_postgres_dump_path,\n+ dump_model,\n+)\n+from ee.hogai.eval.schema import (\n+ ActorsPropertyTaxonomySnapshot,\n+ BaseSnapshot,\n+ ClickhouseProjectDataSnapshot,\n+ DataWarehouseTableSnapshot,\n+ GroupTypeMappingSnapshot,\n+ PostgresProjectDataSnapshot,\n+ PropertyDefinitionSnapshot,\n+ PropertyTaxonomySnapshot,\n+ TeamSnapshot,\n+ TeamTaxonomyItemSnapshot,\n+)\n+from posthog.errors import InternalCHQueryError\n+from posthog.hogql_queries.ai.actors_property_taxonomy_query_runner import (\n+ ActorsPropertyTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.event_taxonomy_query_runner import (\n+ EventTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.team_taxonomy_query_runner import TeamTaxonomyQueryRunner\n+from posthog.models import GroupTypeMapping, Team\n+from posthog.models.property_definition import PropertyDefinition\n+from posthog.schema import (\n+ ActorsPropertyTaxonomyQuery,\n+ EventTaxonomyQuery,\n+ TeamTaxonomyItem,\n+ TeamTaxonomyQuery,\n+)\n+\n+DEFAULT_RETRY_POLICY = dagster.RetryPolicy(\n+ max_retries=4,\n+ delay=2, # 2 seconds\n+ backoff=dagster.Backoff.EXPONENTIAL,\n+ jitter=dagster.Jitter.PLUS_MINUS,\n+)\n+\n+\n+SchemaBound = TypeVar(\"SchemaBound\", bound=BaseSnapshot)\n+\n+\n+def snapshot_postgres_model(\n+ context: dagster.OpExecutionContext,\n+ model_type: type[SchemaBound],\n+ file_name: str,\n+ s3: S3Resource,\n+ project_id: int,\n+ code_version: str | None = None,\n+) -> str:\n+ file_key = compose_postgres_dump_path(project_id, file_name, code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping {file_key} because it already exists\")\n+ return file_key\n+ context.log.info(f\"Dumping {file_key}\")\n+ with dump_model(s3=s3, schema=model_type, file_key=file_key) as dump:\n+ dump(model_type.serialize_for_project(project_id))\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots Postgres project data (property definitions, DWH schema, etc.)\",\n+ retry_policy=DEFAULT_RETRY_POLICY,\n+ code_version=\"v1\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+)\n+def snapshot_postgres_project_data(\n+ context: dagster.OpExecutionContext, project_id: int, s3: S3Resource\n+) -> PostgresProjectDataSnapshot:\n+ context.log.info(f\"Snapshotting Postgres project data for {project_id}\")\n+ snapshot_map: dict[str, type[BaseSnapshot]] = {\n+ \"project\": TeamSnapshot,\n+ \"property_definitions\": PropertyDefinitionSnapshot,\n+ \"group_type_mappings\": GroupTypeMappingSnapshot,\n+ \"data_warehouse_tables\": DataWarehouseTableSnapshot,\n+ }\n+ deps = {\n+ file_name: snapshot_postgres_model(context, model_type, file_name, s3, project_id, context.op_def.version)\n+ for file_name, model_type in snapshot_map.items()\n+ }\n+ context.log_event(\n+ dagster.AssetMaterialization(\n+ asset_key=\"project_postgres_snapshot\",\n+ description=\"Avro snapshots of project Postgres data\",\n+ metadata={\"project_id\": project_id, **deps},\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ )\n+ )\n+ return PostgresProjectDataSnapshot(**deps)\n+\n+\n+C = 
TypeVar(\"C\")\n+\n+\n+@retry(retry=retry_if_exception_type(InternalCHQueryError), stop=stop_after_attempt(4), wait=wait_exponential(min=8))\n+def call_query_runner(callable: Callable[[], C]) -> C:\n+ return callable()\n+\n+\n+def snapshot_properties_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, file_key: str, team: Team, events: list[TeamTaxonomyItem]\n+):\n+ results: list[PropertyTaxonomySnapshot] = []\n+\n+ def snapshot_event(item: TeamTaxonomyItem):\n+ return call_query_runner(\n+ lambda: EventTaxonomyQueryRunner(\n+ query=EventTaxonomyQuery(event=item.event),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ for item in events:\n+ context.log.info(f\"Snapshotting properties taxonomy for event {item.event} of {team.id}\")\n+ results.append(PropertyTaxonomySnapshot(event=item.event, results=snapshot_event(item).results))\n+\n+ context.log.info(f\"Dumping properties taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=PropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+\n+\n+def snapshot_events_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ # Check if files are cached\n+ events_file_key = compose_clickhouse_dump_path(team.id, \"events_taxonomy\", code_version=code_version)\n+ properties_file_key = compose_clickhouse_dump_path(team.id, \"properties_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, events_file_key) and check_dump_exists(s3, properties_file_key):\n+ context.log.info(f\"Skipping events and properties taxonomy snapshot for {team.id} because it already exists\")\n+ return events_file_key, properties_file_key\n+\n+ context.log.info(f\"Snapshotting events taxonomy for {team.id}\")\n+\n+ res = call_query_runner(lambda: TeamTaxonomyQueryRunner(query=TeamTaxonomyQuery(), team=team).calculate())\n+ if not res.results:\n+ raise ValueError(\"No results from events taxonomy query\")\n+\n+ # Dump properties\n+ snapshot_properties_taxonomy(context, s3, properties_file_key, team, res.results)\n+\n+ # Dump later to ensure caching\n+ with dump_model(s3=s3, schema=TeamTaxonomyItemSnapshot, file_key=events_file_key) as dump:\n+ dumped_items = TeamTaxonomyItemSnapshot(results=res.results)\n+ dump([dumped_items])\n+\n+ return events_file_key, properties_file_key\n+\n+\n+T = TypeVar(\"T\")\n+\n+\n+def chunked(iterable: Iterable[T], size: int = 200) -> Iterator[list[T]]:\n+ it = iter(iterable)\n+ while True:\n+ batch = list(islice(it, size))\n+ if not batch:\n+ break\n+ yield batch\n+\n+\n+def snapshot_actors_property_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ file_key = compose_clickhouse_dump_path(team.id, \"actors_property_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping actors property taxonomy snapshot for {team.id} because it already exists\")\n+ return file_key\n+\n+ # Snapshot all group type mappings and person\n+ results: list[ActorsPropertyTaxonomySnapshot] = []\n+ group_type_mappings: list[int | None] = [\n+ None,\n+ *(g.group_type_index for g in GroupTypeMapping.objects.filter(team=team)),\n+ ]\n+\n+ for index in group_type_mappings:\n+ is_group = index is not None\n+ log_entity = f\"group type {index}\" if is_group else \"persons\"\n+ context.log.info(f\"Snapshotting properties taxonomy for {log_entity}\")\n+\n+ # Retrieve saved property definitions for the group type or person\n+ property_defs = (\n+ 
PropertyDefinition.objects.filter(\n+ team=team,\n+ type=PropertyDefinition.Type.GROUP if is_group else PropertyDefinition.Type.PERSON,\n+ group_type_index=index,\n+ )\n+ .values_list(\"name\", flat=True)\n+ .iterator(chunk_size=200)\n+ )\n+\n+ # Query ClickHouse in batches of 200 properties\n+ for batch in chunked(property_defs, 200):\n+\n+ def snapshot(index: int | None, batch: list[str]):\n+ return call_query_runner(\n+ lambda: ActorsPropertyTaxonomyQueryRunner(\n+ query=ActorsPropertyTaxonomyQuery(groupTypeIndex=index, properties=batch, maxPropertyValues=25),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ res = snapshot(index, batch)\n+\n+ if not res.results:\n+ raise ValueError(\n+ f\"No results from actors property taxonomy query for group type {index} and properties {batch}\"\n+ )\n+\n+ # Snapshot queries in the same way as the toolkit expects\n+ for prop, prop_results in zip(batch, res.results):\n+ results.append(\n+ ActorsPropertyTaxonomySnapshot(property=prop, group_type_index=index, results=prop_results)\n+ )\n+\n+ context.log.info(f\"Dumping actors property taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=ActorsPropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots ClickHouse project data\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ code_version=\"v1\",\n+)\n+def snapshot_clickhouse_project_data(",
"comment_created_at": "2025-08-18T09:52:02+00:00",
"comment_author": "denakorita",
"comment_body": "Me overthinking probably: \r\n\r\nI am new with dagster, but from the docs it seems like the current state we have does not allow for any resume upon failure of one of the steps (right now we dont have a steps definition or sth similar). So if we succeed at `snapshot_events_taxonomy` but fail at `snapshot_actors_property_taxonomy` we cannot resume just the `snapshot_actors_property_taxonomy`...nothing tragic really cuz the `check_dump_exists` helps us avoid re-querying, but I think it is much nicer to separate the steps, to take advantage of Dagster's built-in retry/restart capabilities. It enables us also to just update one thing at a time if we ever need to do so.",
"pr_file_module": null
},
{
"comment_id": "2284698523",
"repo_full_name": "PostHog/posthog",
"pr_number": 36585,
"pr_file": "dags/max_ai/snapshot_project_data.py",
"discussion_id": "2281898593",
"commented_code": "@@ -0,0 +1,273 @@\n+from collections.abc import Callable, Iterable, Iterator\n+from itertools import islice\n+from typing import TypeVar\n+\n+import dagster\n+from dagster_aws.s3.resources import S3Resource\n+from tenacity import (\n+ retry,\n+ retry_if_exception_type,\n+ stop_after_attempt,\n+ wait_exponential,\n+)\n+\n+from dags.common import JobOwners\n+from dags.max_ai.utils import (\n+ check_dump_exists,\n+ compose_clickhouse_dump_path,\n+ compose_postgres_dump_path,\n+ dump_model,\n+)\n+from ee.hogai.eval.schema import (\n+ ActorsPropertyTaxonomySnapshot,\n+ BaseSnapshot,\n+ ClickhouseProjectDataSnapshot,\n+ DataWarehouseTableSnapshot,\n+ GroupTypeMappingSnapshot,\n+ PostgresProjectDataSnapshot,\n+ PropertyDefinitionSnapshot,\n+ PropertyTaxonomySnapshot,\n+ TeamSnapshot,\n+ TeamTaxonomyItemSnapshot,\n+)\n+from posthog.errors import InternalCHQueryError\n+from posthog.hogql_queries.ai.actors_property_taxonomy_query_runner import (\n+ ActorsPropertyTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.event_taxonomy_query_runner import (\n+ EventTaxonomyQueryRunner,\n+)\n+from posthog.hogql_queries.ai.team_taxonomy_query_runner import TeamTaxonomyQueryRunner\n+from posthog.models import GroupTypeMapping, Team\n+from posthog.models.property_definition import PropertyDefinition\n+from posthog.schema import (\n+ ActorsPropertyTaxonomyQuery,\n+ EventTaxonomyQuery,\n+ TeamTaxonomyItem,\n+ TeamTaxonomyQuery,\n+)\n+\n+DEFAULT_RETRY_POLICY = dagster.RetryPolicy(\n+ max_retries=4,\n+ delay=2, # 2 seconds\n+ backoff=dagster.Backoff.EXPONENTIAL,\n+ jitter=dagster.Jitter.PLUS_MINUS,\n+)\n+\n+\n+SchemaBound = TypeVar(\"SchemaBound\", bound=BaseSnapshot)\n+\n+\n+def snapshot_postgres_model(\n+ context: dagster.OpExecutionContext,\n+ model_type: type[SchemaBound],\n+ file_name: str,\n+ s3: S3Resource,\n+ project_id: int,\n+ code_version: str | None = None,\n+) -> str:\n+ file_key = compose_postgres_dump_path(project_id, file_name, code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping {file_key} because it already exists\")\n+ return file_key\n+ context.log.info(f\"Dumping {file_key}\")\n+ with dump_model(s3=s3, schema=model_type, file_key=file_key) as dump:\n+ dump(model_type.serialize_for_project(project_id))\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots Postgres project data (property definitions, DWH schema, etc.)\",\n+ retry_policy=DEFAULT_RETRY_POLICY,\n+ code_version=\"v1\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+)\n+def snapshot_postgres_project_data(\n+ context: dagster.OpExecutionContext, project_id: int, s3: S3Resource\n+) -> PostgresProjectDataSnapshot:\n+ context.log.info(f\"Snapshotting Postgres project data for {project_id}\")\n+ snapshot_map: dict[str, type[BaseSnapshot]] = {\n+ \"project\": TeamSnapshot,\n+ \"property_definitions\": PropertyDefinitionSnapshot,\n+ \"group_type_mappings\": GroupTypeMappingSnapshot,\n+ \"data_warehouse_tables\": DataWarehouseTableSnapshot,\n+ }\n+ deps = {\n+ file_name: snapshot_postgres_model(context, model_type, file_name, s3, project_id, context.op_def.version)\n+ for file_name, model_type in snapshot_map.items()\n+ }\n+ context.log_event(\n+ dagster.AssetMaterialization(\n+ asset_key=\"project_postgres_snapshot\",\n+ description=\"Avro snapshots of project Postgres data\",\n+ metadata={\"project_id\": project_id, **deps},\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ )\n+ )\n+ return PostgresProjectDataSnapshot(**deps)\n+\n+\n+C = 
TypeVar(\"C\")\n+\n+\n+@retry(retry=retry_if_exception_type(InternalCHQueryError), stop=stop_after_attempt(4), wait=wait_exponential(min=8))\n+def call_query_runner(callable: Callable[[], C]) -> C:\n+ return callable()\n+\n+\n+def snapshot_properties_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, file_key: str, team: Team, events: list[TeamTaxonomyItem]\n+):\n+ results: list[PropertyTaxonomySnapshot] = []\n+\n+ def snapshot_event(item: TeamTaxonomyItem):\n+ return call_query_runner(\n+ lambda: EventTaxonomyQueryRunner(\n+ query=EventTaxonomyQuery(event=item.event),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ for item in events:\n+ context.log.info(f\"Snapshotting properties taxonomy for event {item.event} of {team.id}\")\n+ results.append(PropertyTaxonomySnapshot(event=item.event, results=snapshot_event(item).results))\n+\n+ context.log.info(f\"Dumping properties taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=PropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+\n+\n+def snapshot_events_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ # Check if files are cached\n+ events_file_key = compose_clickhouse_dump_path(team.id, \"events_taxonomy\", code_version=code_version)\n+ properties_file_key = compose_clickhouse_dump_path(team.id, \"properties_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, events_file_key) and check_dump_exists(s3, properties_file_key):\n+ context.log.info(f\"Skipping events and properties taxonomy snapshot for {team.id} because it already exists\")\n+ return events_file_key, properties_file_key\n+\n+ context.log.info(f\"Snapshotting events taxonomy for {team.id}\")\n+\n+ res = call_query_runner(lambda: TeamTaxonomyQueryRunner(query=TeamTaxonomyQuery(), team=team).calculate())\n+ if not res.results:\n+ raise ValueError(\"No results from events taxonomy query\")\n+\n+ # Dump properties\n+ snapshot_properties_taxonomy(context, s3, properties_file_key, team, res.results)\n+\n+ # Dump later to ensure caching\n+ with dump_model(s3=s3, schema=TeamTaxonomyItemSnapshot, file_key=events_file_key) as dump:\n+ dumped_items = TeamTaxonomyItemSnapshot(results=res.results)\n+ dump([dumped_items])\n+\n+ return events_file_key, properties_file_key\n+\n+\n+T = TypeVar(\"T\")\n+\n+\n+def chunked(iterable: Iterable[T], size: int = 200) -> Iterator[list[T]]:\n+ it = iter(iterable)\n+ while True:\n+ batch = list(islice(it, size))\n+ if not batch:\n+ break\n+ yield batch\n+\n+\n+def snapshot_actors_property_taxonomy(\n+ context: dagster.OpExecutionContext, s3: S3Resource, team: Team, code_version: str | None = None\n+):\n+ file_key = compose_clickhouse_dump_path(team.id, \"actors_property_taxonomy\", code_version=code_version)\n+ if check_dump_exists(s3, file_key):\n+ context.log.info(f\"Skipping actors property taxonomy snapshot for {team.id} because it already exists\")\n+ return file_key\n+\n+ # Snapshot all group type mappings and person\n+ results: list[ActorsPropertyTaxonomySnapshot] = []\n+ group_type_mappings: list[int | None] = [\n+ None,\n+ *(g.group_type_index for g in GroupTypeMapping.objects.filter(team=team)),\n+ ]\n+\n+ for index in group_type_mappings:\n+ is_group = index is not None\n+ log_entity = f\"group type {index}\" if is_group else \"persons\"\n+ context.log.info(f\"Snapshotting properties taxonomy for {log_entity}\")\n+\n+ # Retrieve saved property definitions for the group type or person\n+ property_defs = (\n+ 
PropertyDefinition.objects.filter(\n+ team=team,\n+ type=PropertyDefinition.Type.GROUP if is_group else PropertyDefinition.Type.PERSON,\n+ group_type_index=index,\n+ )\n+ .values_list(\"name\", flat=True)\n+ .iterator(chunk_size=200)\n+ )\n+\n+ # Query ClickHouse in batches of 200 properties\n+ for batch in chunked(property_defs, 200):\n+\n+ def snapshot(index: int | None, batch: list[str]):\n+ return call_query_runner(\n+ lambda: ActorsPropertyTaxonomyQueryRunner(\n+ query=ActorsPropertyTaxonomyQuery(groupTypeIndex=index, properties=batch, maxPropertyValues=25),\n+ team=team,\n+ ).calculate()\n+ )\n+\n+ res = snapshot(index, batch)\n+\n+ if not res.results:\n+ raise ValueError(\n+ f\"No results from actors property taxonomy query for group type {index} and properties {batch}\"\n+ )\n+\n+ # Snapshot queries in the same way as the toolkit expects\n+ for prop, prop_results in zip(batch, res.results):\n+ results.append(\n+ ActorsPropertyTaxonomySnapshot(property=prop, group_type_index=index, results=prop_results)\n+ )\n+\n+ context.log.info(f\"Dumping actors property taxonomy to {file_key}\")\n+ with dump_model(s3=s3, schema=ActorsPropertyTaxonomySnapshot, file_key=file_key) as dump:\n+ dump(results)\n+ return file_key\n+\n+\n+@dagster.op(\n+ description=\"Snapshots ClickHouse project data\",\n+ tags={\"owner\": JobOwners.TEAM_MAX_AI.value},\n+ code_version=\"v1\",\n+)\n+def snapshot_clickhouse_project_data(",
"comment_created_at": "2025-08-19T09:35:33+00:00",
"comment_author": "skoob13",
"comment_body": "Yes, you're right. Retries and parallelization are not ideal now. I've started implementing this with `graph_asset`, where we can parallelize execution through a partition_key, but it seemed to me like overkill now. Let's revisit it later because I don't yet understand the patterns and data we'll use for evaluations. It should work for the first iteration.",
"pr_file_module": null
}
]
},
{
"discussion_id": "2260162092",
"pr_number": 36303,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"created_at": "2025-08-07T12:25:19+00:00",
"commented_code": "def get_metric_meter(additional_attributes: Attributes | None = None) -> MetricMeter:\n \"\"\"Return a meter depending on in which context we are.\"\"\"\n- attributes = get_attributes(additional_attributes)\n-\n if activity.in_activity():\n meter = activity.metric_meter()\n elif workflow.in_workflow():\n meter = workflow.metric_meter()\n else:\n raise RuntimeError(\"Not within workflow or activity context\")\n \n- meter = meter.with_additional_attributes(attributes)\n-\n- return meter\n-\n-\n-def get_attributes(additional_attributes: Attributes | None = None) -> Attributes:",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2260162092",
"repo_full_name": "PostHog/posthog",
"pr_number": 36303,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"discussion_id": "2260162092",
"commented_code": "@@ -249,54 +251,17 @@ def reset(self):\n \n def get_metric_meter(additional_attributes: Attributes | None = None) -> MetricMeter:\n \"\"\"Return a meter depending on in which context we are.\"\"\"\n- attributes = get_attributes(additional_attributes)\n-\n if activity.in_activity():\n meter = activity.metric_meter()\n elif workflow.in_workflow():\n meter = workflow.metric_meter()\n else:\n raise RuntimeError(\"Not within workflow or activity context\")\n \n- meter = meter.with_additional_attributes(attributes)\n-\n- return meter\n-\n-\n-def get_attributes(additional_attributes: Attributes | None = None) -> Attributes:",
"comment_created_at": "2025-08-07T12:25:19+00:00",
"comment_author": "tomasfarias",
"comment_body": "Getting rid of this because temporal sets `activity_type` `workflow_type` `task_queue` and `namespace` on all metrics made from their meters. So, we don't need to pass them as additional attributes.",
"pr_file_module": null
},
{
"comment_id": "2260188111",
"repo_full_name": "PostHog/posthog",
"pr_number": 36303,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"discussion_id": "2260162092",
"commented_code": "@@ -249,54 +251,17 @@ def reset(self):\n \n def get_metric_meter(additional_attributes: Attributes | None = None) -> MetricMeter:\n \"\"\"Return a meter depending on in which context we are.\"\"\"\n- attributes = get_attributes(additional_attributes)\n-\n if activity.in_activity():\n meter = activity.metric_meter()\n elif workflow.in_workflow():\n meter = workflow.metric_meter()\n else:\n raise RuntimeError(\"Not within workflow or activity context\")\n \n- meter = meter.with_additional_attributes(attributes)\n-\n- return meter\n-\n-\n-def get_attributes(additional_attributes: Attributes | None = None) -> Attributes:",
"comment_created_at": "2025-08-07T12:35:53+00:00",
"comment_author": "tomasfarias",
"comment_body": "From here: https://github.com/temporalio/sdk-python#metrics",
"pr_file_module": null
}
]
}
]