diff --git a/docs/source/polars-cloud/run/query-profile.md b/docs/source/polars-cloud/run/query-profile.md index a2d57ce8b5b4..3ccf65180f7b 100644 --- a/docs/source/polars-cloud/run/query-profile.md +++ b/docs/source/polars-cloud/run/query-profile.md @@ -7,58 +7,115 @@ spent during execution. This visibility helps you optimize complex queries and better understand the distributed execution of queries. -
+
Example query and dataset You can copy and paste the example below to explore the feature yourself. Don't forget to change the workspace name to one of your own workspaces. -```python -import polars as pl -import polars_cloud as pc - -pc.authenticate() - -ctx = pc.ComputeContext(workspace="your-workspace", cpus=12, memory=12, cluster_size=4) - -def pdsh_q3(customer, lineitem, orders): - return ( - customer.filter(pl.col("c_mktsegment") == "BUILDING") - .join(orders, left_on="c_custkey", right_on="o_custkey") - .join(lineitem, left_on="o_orderkey", right_on="l_orderkey") - .filter(pl.col("o_orderdate") < pl.date(1995, 3, 15)) - .filter(pl.col("l_shipdate") > pl.date(1995, 3, 15)) - .with_columns( - (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("revenue") - ) - .group_by("o_orderkey", "o_orderdate", "o_shippriority") - .agg(pl.sum("revenue")) - .select( - pl.col("o_orderkey").alias("l_orderkey"), - "revenue", - "o_orderdate", - "o_shippriority", - ) - .sort(by=["revenue", "o_orderdate"], descending=[True, False]) - ) - -lineitem = pl.scan_parquet( - "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/lineitem/*.parquet", - storage_options={"request_payer": "true"}, -) -customer = pl.scan_parquet( - "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/customer/*.parquet", - storage_options={"request_payer": "true"}, -) -orders = pl.scan_parquet( - "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/orders/*.parquet", - storage_options={"request_payer": "true"}, -) -``` +{{code_block('polars-cloud/query-profile','query',[])}}
-{{code_block('polars-cloud/query-profile','execute',[])}}
+## Polars Cloud Query Profiler
+
+Polars Cloud has a built-in query profiler. It shows the real-time status of the query during and
+after execution, and gives you detailed metrics down to the node level. This can help you find and
+analyze bottlenecks, helping you to run your queries optimally.
+
+The query profiler can be accessed from the cluster dashboard.
+
+### Cluster Dashboard
+
+The cluster dashboard gives you:
+
+- insights into system metrics (CPU, memory, and network) of all nodes on your cluster.
+- an overview of the queries that are related to this cluster: scheduled, running, and finished.
+
+![Cluster Dashboard](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/cluster_dashboard.png)
+
+You can get into the cluster dashboard through the pop-ups on the Polars Cloud dashboard after
+starting a compute cluster, or by going to the details page of your compute.
+
+![Compute Details page](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/compute_dashboard.png)
+
+This dashboard runs from the compute that you're running your queries on. It becomes available the
+moment your compute has started and is no longer available after your cluster shuts down.
+
+The system resources allow you to find bottlenecks and tweak your cluster configuration accordingly.
+
+- In case of CPU or memory bottlenecks, you can scale vertically, adding more vCPUs and RAM.
+- In case your network bandwidth maxes out, you can scale horizontally, by adding more nodes.
+- In case none of the resources seem to be a bottleneck, you might have overprovisioned your
+  cluster. In this case you can scale it down until you notice higher resource usage, reducing cost.
+- Clusters that are underprovisioned but balanced can get the job done, but by scaling diagonally
+  (larger instances and more nodes) you could still reduce runtime.
+
+There is no one-size-fits-all solution. This is different for every dataset, query, and use case.
+These visualisations help you figure out what fits for you.
+
+### Query Details
+
+When you select a query from the cluster dashboard, you open its details. An overview opens that
+displays the general metrics of that query.
+
+![Query Details](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/query_details.png)
+
+From here you can dive deeper into different aspects of the query. The first one we'll explore is
+the logical plan.
+
+### Logical Plan
+
+In Polars, a logical plan is the intermediate representation (IR) of a query that describes what
+operations are performed, and in which order, before physical execution details are decided. This
+shows the graph that is a representation of the query you sent to Polars Cloud. You can think of the
+logical plan as the `EXPLAIN` in SQL.
+
+![Logical Plan](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/logical_plan.png)
+
+The logical plan lets you verify that Polars interprets your query correctly. You could, for
+example, check if filters are applied before joins, or if the expected columns are selected. It is
+also useful for understanding how query optimization has transformed your original query, which you
+defined in Polars' DSL.
+
+### Stage Graph
+
+The stage graph represents the different stages that will be executed on a distributed query. Note
+that a single-node query doesn't have a notion of stages.
+
+From the overview with the stage graph you can click one of the operations in any stage to open up
+its details.
+
+![Stage graph node details](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/stage_graph_node_details.png)
+
+In it you can see details of the operation in that node.
+
+Alternatively, you can click the stage itself, opening the stage graph details.
+
+![Stage graph details](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/stage_graph_stage_details.png)
+
+You can see general performance metrics of this stage, such as the number of workers involved, the
+output rows, the bytes read and written, and the timeline of the stage.
+
+### Physical Plan
+
+The physical plan shows how the query was executed by the engine. You can get to this screen by
+clicking `View physical plan` in the Stage details, for a distributed query, or you'll go straight
+here from the query details for a single-node query.
+
+![Physical plan](https://raw.githubusercontent.com/pola-rs/polars-static/refs/heads/master/docs/query-profiler/physical_plan.png)
+
+In it you can find the time spent per node, identifying choke points. Additionally, some nodes are
+marked with warnings when they're potentially memory-intensive, or executed on a single node. In the
+details pane you can find specific metrics on how many rows went in and out, what the morsel sizes
+were and how many went through, and more.
+
+## Profile with the Polars Cloud SDK
+
+Besides the query profiler in the cluster dashboard, you can also get diagnostic information through
+the Polars Cloud SDK.
+
+### `QueryProfile` and `QueryResult`
 
 The `await_profile` method can be used to monitor an in-progress query. It returns a QueryProfile
 object containing a DataFrame with information about which stages are being processed across
@@ -105,7 +162,7 @@ As each worker starts and completes each stage of the query, it notifies the lea
 `await_profile` method will poll the lead worker until there is an update from any worker, and then
 return the full profile data of the query.
 
-The QueryProfile object also has a summary property to return an aggregated view of each stage.
+The `QueryProfile` object also has a `summary` property to return an aggregated view of each stage.
 {{code_block('polars-cloud/query-profile','await_summary',[])}}
 
@@ -129,3 +186,38 @@ shape: (13, 6)
 │ 7 ┆ Execute IR ┆ true ┆ i-xxx ┆ 356662µs ┆ 1131041 ┆ 289546496 ┆ 0 │
 └──────────────┴──────────────┴───────────┴────────────┴──────────────┴─────────────┴───────────────────────┴────────────────────┘
 ```
+
+### Plan
+
+`QueryProfile` also exposes `.plan()` to retrieve the physical plan as a string, and `.graph()` to
+render it as a visual diagram. Both methods are shown in the examples below.
+
+Use `.plan()` to retrieve the executed query plan as a string. This is useful for understanding
+exactly how Polars executed your query, including the physical stages and operations performed
+across the cluster.
+
+{{code_block('polars-cloud/query-profile','explain',['QueryResult'])}}
+
+```text
+# TODO: add example output
+```
+
+You can also retrieve the optimized intermediate representation (IR) of the query before execution
+by passing `"ir"` as the plan type.
+
+{{code_block('polars-cloud/query-profile','explain_ir',['QueryResult'])}}
+
+```text
+# TODO: add example output
+```
+
+### Graph
+
+Both `plan()` and `graph()` are available on `QueryResult` (with `plan_type` set to `"physical"` or
+`"ir"`) and on `QueryProfile` (physical plan only). These methods are only available in direct mode.
+
+Use `.graph()` to render the plan as a visual dot diagram using matplotlib.
+
+{{code_block('polars-cloud/query-profile','graph',['QueryResult'])}}
+
+
diff --git a/docs/source/src/python/polars-cloud/query-profile.py b/docs/source/src/python/polars-cloud/query-profile.py
index d6377c9ebc28..71c1c2117cda 100644
--- a/docs/source/src/python/polars-cloud/query-profile.py
+++ b/docs/source/src/python/polars-cloud/query-profile.py
@@ -1,7 +1,48 @@
 """
-# --8<-- [start:execute]
+# --8<-- [start:query]
+import polars as pl
+import polars_cloud as pc
+
+pc.authenticate()
+
+ctx = pc.ComputeContext(workspace="your-workspace", cpus=12, memory=12, cluster_size=4)
+
+def pdsh_q3(customer, lineitem, orders):
+    return (
+        customer.filter(pl.col("c_mktsegment") == "BUILDING")
+        .join(orders, left_on="c_custkey", right_on="o_custkey")
+        .join(lineitem, left_on="o_orderkey", right_on="l_orderkey")
+        .filter(pl.col("o_orderdate") < pl.date(1995, 3, 15))
+        .filter(pl.col("l_shipdate") > pl.date(1995, 3, 15))
+        .with_columns(
+            (pl.col("l_extendedprice") * (1 - pl.col("l_discount"))).alias("revenue")
+        )
+        .group_by("o_orderkey", "o_orderdate", "o_shippriority")
+        .agg(pl.sum("revenue"))
+        .select(
+            pl.col("o_orderkey").alias("l_orderkey"),
+            "revenue",
+            "o_orderdate",
+            "o_shippriority",
+        )
+        .sort(by=["revenue", "o_orderdate"], descending=[True, False])
+    )
+
+lineitem = pl.scan_parquet(
+    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/lineitem/*.parquet",
+    storage_options={"request_payer": "true"},
+)
+customer = pl.scan_parquet(
+    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/customer/*.parquet",
+    storage_options={"request_payer": "true"},
+)
+orders = pl.scan_parquet(
+    "s3://polars-cloud-samples-us-east-2-prd/pdsh/sf100/orders/*.parquet",
+    storage_options={"request_payer": "true"},
+)
+
 result = pdsh_q3(customer, lineitem, orders).remote(ctx).distributed().execute()
-# --8<-- [end:execute]
+# --8<-- [end:query]
 
 # --8<-- [start:await_profile]
 result.await_profile().data
@@ -10,4 +51,16 @@
 # --8<-- [start:await_summary]
result.await_profile().summary # --8<-- [end:await_summary] + +# --8<-- [start:explain] +result.await_result().plan() +# --8<-- [end:explain] + +# --8<-- [start:explain_ir] +result.await_result().plan("ir") +# --8<-- [end:explain_ir] + +# --8<-- [start:graph] +result.await_result().graph() +# --8<-- [end:graph] """