Files
awesome-reviewers/_reviewers/posthog-cache-expensive-operations.json
2025-08-19 12:19:58 +00:00

232 lines
24 KiB
JSON

[
{
"discussion_id": "2277914735",
"pr_number": 36608,
"pr_file": "posthog/hogql/database/database.py",
"created_at": "2025-08-14T23:12:28+00:00",
"commented_code": "with timings.measure(\"data_warehouse_tables\"):\n with timings.measure(\"select\"):\n- tables = list(\n- DataWarehouseTable.objects.filter(team_id=team.pk)\n- .exclude(deleted=True)\n- .select_related(\"credential\", \"external_data_source\")\n- )\n+ if cache_enabled:\n+ tables = list(\n+ DataWarehouseTable.objects.filter(team_id=team.pk)\n+ .exclude(deleted=True)\n+ .select_related(\"credential\", \"external_data_source\")\n+ .fetch_cached(team_id=team_id or team.pk, key_prefix=CACHE_KEY_PREFIX)\n+ )\n+ else:\n+ tables = list(\n+ DataWarehouseTable.objects.filter(team_id=team.pk)\n+ .exclude(deleted=True)\n+ .select_related(\"credential\", \"external_data_source\")\n+ )",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2277914735",
"repo_full_name": "PostHog/posthog",
"pr_number": 36608,
"pr_file": "posthog/hogql/database/database.py",
"discussion_id": "2277914735",
"commented_code": "@@ -541,11 +563,19 @@ def create_hogql_database(\n \n with timings.measure(\"data_warehouse_tables\"):\n with timings.measure(\"select\"):\n- tables = list(\n- DataWarehouseTable.objects.filter(team_id=team.pk)\n- .exclude(deleted=True)\n- .select_related(\"credential\", \"external_data_source\")\n- )\n+ if cache_enabled:\n+ tables = list(\n+ DataWarehouseTable.objects.filter(team_id=team.pk)\n+ .exclude(deleted=True)\n+ .select_related(\"credential\", \"external_data_source\")\n+ .fetch_cached(team_id=team_id or team.pk, key_prefix=CACHE_KEY_PREFIX)\n+ )\n+ else:\n+ tables = list(\n+ DataWarehouseTable.objects.filter(team_id=team.pk)\n+ .exclude(deleted=True)\n+ .select_related(\"credential\", \"external_data_source\")\n+ )",
"comment_created_at": "2025-08-14T23:12:28+00:00",
"comment_author": "rafaeelaudibert",
"comment_body": "And, likewise\n\n\n```suggestion\n tables = DataWarehouseTable.objects.filter(team_id=team.pk)\n .exclude(deleted=True)\n .select_related(\"credential\", \"external_data_source\")\n if cache_enabled:\n tables = tables.fetch_cached(team_id=team_id or team.pk, key_prefix=CACHE_KEY_PREFIX)\n```",
"pr_file_module": null
}
]
},
{
"discussion_id": "2269379019",
"pr_number": 35726,
"pr_file": "ee/hogai/graph/insights/nodes.py",
"created_at": "2025-08-12T10:14:51+00:00",
"commented_code": "return \"\n\".join(formatted_insights)\n \n- def _parse_insight_ids(self, response_content: str) -> list[int]:\n- \"\"\"Parse insight IDs from LLM response.\"\"\"\n- import re\n+ def _get_all_loaded_insight_ids(self) -> set[int]:\n+ \"\"\"Get all insight IDs from loaded pages.\"\"\"\n+ all_ids = set()\n+ for page_insights in self._loaded_pages.values():\n+ for insight in page_insights:\n+ all_ids.add(insight.id)\n+ return all_ids\n+\n+ def _find_insight_by_id(self, insight_id: int) -> Insight | None:",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2269379019",
"repo_full_name": "PostHog/posthog",
"pr_number": 35726,
"pr_file": "ee/hogai/graph/insights/nodes.py",
"discussion_id": "2269379019",
"commented_code": "@@ -242,62 +307,254 @@ def _format_insights_page(self, page_number: int) -> str:\n \n return \"\\n\".join(formatted_insights)\n \n- def _parse_insight_ids(self, response_content: str) -> list[int]:\n- \"\"\"Parse insight IDs from LLM response.\"\"\"\n- import re\n+ def _get_all_loaded_insight_ids(self) -> set[int]:\n+ \"\"\"Get all insight IDs from loaded pages.\"\"\"\n+ all_ids = set()\n+ for page_insights in self._loaded_pages.values():\n+ for insight in page_insights:\n+ all_ids.add(insight.id)\n+ return all_ids\n+\n+ def _find_insight_by_id(self, insight_id: int) -> Insight | None:",
"comment_created_at": "2025-08-12T10:14:51+00:00",
"comment_author": "sortafreel",
"comment_body": "It seems like O(n2) complexity and is being called multiple times. Would it make sense to cache it somehow, or are the loaded pages too dynamic? Or maybe just build the `self` index dictionary on the go? With thousands of insights it can get a bit expensive, as I see it.",
"pr_file_module": null
},
{
"comment_id": "2273402677",
"repo_full_name": "PostHog/posthog",
"pr_number": 35726,
"pr_file": "ee/hogai/graph/insights/nodes.py",
"discussion_id": "2269379019",
"commented_code": "@@ -242,62 +307,254 @@ def _format_insights_page(self, page_number: int) -> str:\n \n return \"\\n\".join(formatted_insights)\n \n- def _parse_insight_ids(self, response_content: str) -> list[int]:\n- \"\"\"Parse insight IDs from LLM response.\"\"\"\n- import re\n+ def _get_all_loaded_insight_ids(self) -> set[int]:\n+ \"\"\"Get all insight IDs from loaded pages.\"\"\"\n+ all_ids = set()\n+ for page_insights in self._loaded_pages.values():\n+ for insight in page_insights:\n+ all_ids.add(insight.id)\n+ return all_ids\n+\n+ def _find_insight_by_id(self, insight_id: int) -> Insight | None:",
"comment_created_at": "2025-08-13T13:03:30+00:00",
"comment_author": "tatoalo",
"comment_body": "Yeah good point! Added a super simple insight ID caching mechanism.",
"pr_file_module": null
}
]
},
{
"discussion_id": "2277698056",
"pr_number": 36663,
"pr_file": "posthog/hogql_queries/query_runner.py",
"created_at": "2025-08-14T20:50:58+00:00",
"commented_code": "self.__post_init__()\n \n \n-class QueryRunnerWithHogQLContext(QueryRunner):\n+# Type constraint for analytics query responses\n+AR = TypeVar(\"AR\", bound=AnalyticsQueryResponseBase)\n+\n+\n+class AnalyticsQueryRunner(QueryRunner[Q, AR, CR], Generic[Q, AR, CR]):\n+ \"\"\"\n+ QueryRunner subclass that constrains the response type to AnalyticsQueryResponseBase.\n+ \"\"\"\n+\n+ def calculate(self) -> AR:\n+ response = self._calculate()\n+ if not self.modifiers.timings:",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2277698056",
"repo_full_name": "PostHog/posthog",
"pr_number": 36663,
"pr_file": "posthog/hogql_queries/query_runner.py",
"discussion_id": "2277698056",
"commented_code": "@@ -1134,7 +1138,23 @@ def apply_dashboard_filters(self, dashboard_filter: DashboardFilter):\n self.__post_init__()\n \n \n-class QueryRunnerWithHogQLContext(QueryRunner):\n+# Type constraint for analytics query responses\n+AR = TypeVar(\"AR\", bound=AnalyticsQueryResponseBase)\n+\n+\n+class AnalyticsQueryRunner(QueryRunner[Q, AR, CR], Generic[Q, AR, CR]):\n+ \"\"\"\n+ QueryRunner subclass that constrains the response type to AnalyticsQueryResponseBase.\n+ \"\"\"\n+\n+ def calculate(self) -> AR:\n+ response = self._calculate()\n+ if not self.modifiers.timings:",
"comment_created_at": "2025-08-14T20:50:58+00:00",
"comment_author": "rafaeelaudibert",
"comment_body": "We could do this differently by updating `HogQLTimings` to be a noop when this isnt set, avoid spending time timing stuff",
"pr_file_module": null
},
{
"comment_id": "2277738469",
"repo_full_name": "PostHog/posthog",
"pr_number": 36663,
"pr_file": "posthog/hogql_queries/query_runner.py",
"discussion_id": "2277698056",
"commented_code": "@@ -1134,7 +1138,23 @@ def apply_dashboard_filters(self, dashboard_filter: DashboardFilter):\n self.__post_init__()\n \n \n-class QueryRunnerWithHogQLContext(QueryRunner):\n+# Type constraint for analytics query responses\n+AR = TypeVar(\"AR\", bound=AnalyticsQueryResponseBase)\n+\n+\n+class AnalyticsQueryRunner(QueryRunner[Q, AR, CR], Generic[Q, AR, CR]):\n+ \"\"\"\n+ QueryRunner subclass that constrains the response type to AnalyticsQueryResponseBase.\n+ \"\"\"\n+\n+ def calculate(self) -> AR:\n+ response = self._calculate()\n+ if not self.modifiers.timings:",
"comment_created_at": "2025-08-14T21:16:19+00:00",
"comment_author": "aspicer",
"comment_body": "This was my original implementation, but @Gilbert09 suggested that it might be confusing and lead to code issues. The timings aren't that computationally intensive, so this PR just removes them at the proper layer (the query runner layer). \r\n\r\nSee discussion here: https://github.com/PostHog/posthog/pull/36600",
"pr_file_module": null
},
{
"comment_id": "2277758429",
"repo_full_name": "PostHog/posthog",
"pr_number": 36663,
"pr_file": "posthog/hogql_queries/query_runner.py",
"discussion_id": "2277698056",
"commented_code": "@@ -1134,7 +1138,23 @@ def apply_dashboard_filters(self, dashboard_filter: DashboardFilter):\n self.__post_init__()\n \n \n-class QueryRunnerWithHogQLContext(QueryRunner):\n+# Type constraint for analytics query responses\n+AR = TypeVar(\"AR\", bound=AnalyticsQueryResponseBase)\n+\n+\n+class AnalyticsQueryRunner(QueryRunner[Q, AR, CR], Generic[Q, AR, CR]):\n+ \"\"\"\n+ QueryRunner subclass that constrains the response type to AnalyticsQueryResponseBase.\n+ \"\"\"\n+\n+ def calculate(self) -> AR:\n+ response = self._calculate()\n+ if not self.modifiers.timings:",
"comment_created_at": "2025-08-14T21:29:57+00:00",
"comment_author": "rafaeelaudibert",
"comment_body": "Thanks for the extra context, I agree with the approach :)",
"pr_file_module": null
}
]
},
{
"discussion_id": "2276175419",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"created_at": "2025-08-14T10:02:41+00:00",
"commented_code": "from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2276175419",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T10:02:41+00:00",
"comment_author": "Gilbert09",
"comment_body": "I don't think having an implicit no-op `HogQLTimings` is a good idea - we're just asking for confusion and bugs with this - \"why are timings not working, I'm using `HogQLTimings`\". I think I'd prefer we kept the implementation as-is, but update the `query.py` file to conditionally set timings to `None` instead when `debug` is not True",
"pr_file_module": null
},
{
"comment_id": "2277205963",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T17:00:37+00:00",
"comment_author": "aspicer",
"comment_body": "The issue here is that timings runs across the stack. It is implemented as a hogql modifier, but multiple things touch timings.\r\n\r\nTimings is generally instantiated in the init of query_runner.py\r\n\r\nFrom here, the code usually calls the calculate() or run() or to_query() method on a subclass of query_runner.\r\n\r\nThe query_runner doesn't have a return type bound other than BaseModel. You could theoretically use it for anything. So it doesn't necessarily have a hogql field or a timings field.\r\n\r\nSo the question is where to remove the timings data?\r\n\r\nSince it's a hogql modifier, you could remove the hogql and the timings return value in execute_hogql_query, but then sometimes the actual query_runner one level higher (see actors_query_runner) adds things to it.\r\n\r\nHow can I stop these various query runners from returning timings from their calculate methods without having to make every query runner have logic in it that handles the modifier case of debug?\r\n\r\nWe could do an `if hasattr delattr` thing somewhere in process_query_model but that seems hackier. Open to ideas.",
"pr_file_module": null
},
{
"comment_id": "2277234879",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T17:13:17+00:00",
"comment_author": "Gilbert09",
"comment_body": "To be fair, I probably wouldn't tie this change to the `DEBUG` modifier - it's very non-descript for what actually happens. I'd probably add a timings modifier and base everything off that. \r\n\r\n> So the question is where to remove the timings data?\r\n\r\nHonestly, at the top level seems sensible to me - if that means we have to modify every query runner to handle the case of a missing `timings` object, then that's an appropriate approach imo. Query Runner timings != HogQL timings - we just happen to merge the hogql timings into the query runner timings. Do you want to remove only the hogql timings from the query results, or both hogql and query runner? Because the answer to that should help with where the logic goes",
"pr_file_module": null
},
{
"comment_id": "2277263915",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T17:23:12+00:00",
"comment_author": "aspicer",
"comment_body": "Yes, we don't actually care what happens in HogQL directly from the client side. \r\nWe want to remove timings from query runner calculate calls.\r\n\r\nI guess this could be done by refactoring all query runners that return things that inherit their return from AnalyticsQueryResponseBase and handling it that way without having to make the query runners completely aware of it. I'll look into it. Thanks for the feedback.",
"pr_file_module": null
},
{
"comment_id": "2277420989",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T18:25:29+00:00",
"comment_author": "Gilbert09",
"comment_body": "Thinking about this some more on my commute home. Why don't we just remove timings from the response object at the API layer? This is more of an API concern, right?",
"pr_file_module": null
},
{
"comment_id": "2277458679",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T18:44:44+00:00",
"comment_author": "aspicer",
"comment_body": "That makes sense but two issues:\r\n1. The original reason for doing this is to avoid caching timings, so it has to be at the query_runner level at least\r\n2. The query endpoint allows for any response type, only some of which are analytics queries that have the timings field.\r\n\r\nI think pushing it to the query_runner layer is the right call here, it's relatively straight forward and I'm handling it now. A lot cleaner with no changes to hogql or the timings object. ",
"pr_file_module": null
},
{
"comment_id": "2277460770",
"repo_full_name": "PostHog/posthog",
"pr_number": 36600,
"pr_file": "posthog/hogql/timings.py",
"discussion_id": "2276175419",
"commented_code": "@@ -1,12 +1,36 @@\n from time import perf_counter\n from contextlib import contextmanager\n+from collections.abc import Iterator\n \n from posthog.schema import QueryTiming\n \n \n+class HogQLTimings:\n+ \"\"\"No-op version of HogQLTimings that doesn't collect timing data.\"\"\"",
"comment_created_at": "2025-08-14T18:45:33+00:00",
"comment_author": "Gilbert09",
"comment_body": "Great, thank you!",
"pr_file_module": null
}
]
},
{
"discussion_id": "2245583189",
"pr_number": 35957,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"created_at": "2025-07-31T14:45:30+00:00",
"commented_code": "\"interval\": interval,\n }\n \n- activity_attempt = activity_info.attempt\n meter = get_metric_meter(histogram_attributes)\n- hist = meter.create_histogram(\n- name=\"batch_exports_activity_attempt\",\n- description=\"Histogram tracking attempts made by critical batch export activities\",\n+\n+ try:\n+ with ExecutionTimeRecorder(\n+ \"batch_exports_activity_interval_execution_latency\",\n+ description=\"Histogram tracking execution latency for critical batch export activities by interval\",\n+ histogram_attributes=histogram_attributes,\n+ log=False,\n+ ):\n+ result = await super().execute_activity(input)\n+ finally:\n+ attempts_total_counter = meter.create_counter(\n+ name=\"batch_exports_activity_attempts\",\n+ description=\"Counter tracking every attempt at running an activity\",\n+ )\n+ attempts_total_counter.add(1)",
"repo_full_name": "PostHog/posthog",
"discussion_comments": [
{
"comment_id": "2245611026",
"repo_full_name": "PostHog/posthog",
"pr_number": 35957,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"discussion_id": "2245583189",
"commented_code": "@@ -94,21 +94,30 @@ async def execute_activity(self, input: ExecuteActivityInput) -> typing.Any:\n \"interval\": interval,\n }\n \n- activity_attempt = activity_info.attempt\n meter = get_metric_meter(histogram_attributes)\n- hist = meter.create_histogram(\n- name=\"batch_exports_activity_attempt\",\n- description=\"Histogram tracking attempts made by critical batch export activities\",\n+\n+ try:\n+ with ExecutionTimeRecorder(\n+ \"batch_exports_activity_interval_execution_latency\",\n+ description=\"Histogram tracking execution latency for critical batch export activities by interval\",\n+ histogram_attributes=histogram_attributes,\n+ log=False,\n+ ):\n+ result = await super().execute_activity(input)\n+ finally:\n+ attempts_total_counter = meter.create_counter(\n+ name=\"batch_exports_activity_attempts\",\n+ description=\"Counter tracking every attempt at running an activity\",\n+ )\n+ attempts_total_counter.add(1)",
"comment_created_at": "2025-07-31T14:45:30+00:00",
"comment_author": "tomasfarias",
"comment_body": "hmm we can cache it if it becomes a problem, otherwise we have to keep it here as we don't have access to the attributes outside of this context.",
"pr_file_module": null
},
{
"comment_id": "2245628459",
"repo_full_name": "PostHog/posthog",
"pr_number": 35957,
"pr_file": "products/batch_exports/backend/temporal/metrics.py",
"discussion_id": "2245583189",
"commented_code": "@@ -94,21 +94,30 @@ async def execute_activity(self, input: ExecuteActivityInput) -> typing.Any:\n \"interval\": interval,\n }\n \n- activity_attempt = activity_info.attempt\n meter = get_metric_meter(histogram_attributes)\n- hist = meter.create_histogram(\n- name=\"batch_exports_activity_attempt\",\n- description=\"Histogram tracking attempts made by critical batch export activities\",\n+\n+ try:\n+ with ExecutionTimeRecorder(\n+ \"batch_exports_activity_interval_execution_latency\",\n+ description=\"Histogram tracking execution latency for critical batch export activities by interval\",\n+ histogram_attributes=histogram_attributes,\n+ log=False,\n+ ):\n+ result = await super().execute_activity(input)\n+ finally:\n+ attempts_total_counter = meter.create_counter(\n+ name=\"batch_exports_activity_attempts\",\n+ description=\"Counter tracking every attempt at running an activity\",\n+ )\n+ attempts_total_counter.add(1)",
"comment_created_at": "2025-07-31T14:52:08+00:00",
"comment_author": "tomasfarias",
"comment_body": "we'll monitor and decide later if this is a problem. `ExecutionTimeCounter` does the same and we haven't noticed impact.",
"pr_file_module": null
}
]
}
]