From 3fe00646d67040921edbb27f7b21ced6f2d49020 Mon Sep 17 00:00:00 2001 From: Stefano Ottolenghi Date: Mon, 10 Jun 2024 10:00:44 +0200 Subject: [PATCH] . --- .../modules/ROOT/pages/performance.adoc | 89 +++++++++++-------- 1 file changed, 51 insertions(+), 38 deletions(-) diff --git a/python-manual/modules/ROOT/pages/performance.adoc b/python-manual/modules/ROOT/pages/performance.adoc index c0730f93..178c4c79 100644 --- a/python-manual/modules/ROOT/pages/performance.adoc +++ b/python-manual/modules/ROOT/pages/performance.adoc @@ -67,26 +67,27 @@ The Neo4j server can retrieve records in batches, and the driver can receive one Lazy-loading a result spreads out network traffic and memory usage. For convenience, xref:query-simple.adoc[`.execute_query()`] always retrieves all result records at once (it is what the `Eager` in `EagerResult` stands for). -To lazy-load a result, you have to use xref:transactions.adoc#managed-transactions[`.execute_read/write()`] (or other forms of manually-handled transactions) and *not* cast the `Result` object to `list` when processing the result; iterate on it instead. +To lazy-load a result, you have to use xref:transactions.adoc#managed-transactions[`.execute_read/write()`] (or other forms of manually-handled xref:transactions.adoc[transactions]) and *not* cast the `Result` object to `list` when processing the result; iterate on it instead. .Comparison between eager and lazy loading ==== -Consider a query that results in 4500 result records, and that the driver's link:https://neo4j.com/docs/api/python-driver/current/api.html#fetch-size-ref[batch size] is set to 1000 (default). +Consider a query that results in 2500 result records, and that the driver's link:https://neo4j.com/docs/api/python-driver/current/api.html#fetch-size-ref[batch size] is set to 1000 (default). 
[cols="1a,1a", options="header"]
|===
|Eager loading
|Lazy loading

|
-- The server has to read all 4500 records from the storage before it can send even the first one the driver (i.e. it takes more time for the client to receive the first record).
-- Before any record is available to the application, the driver has to receive all 4500 records.
-- The client has to hold in memory all of 4500 records.
+- The server has to read all 2500 records from the storage before it can send even the first one to the driver (i.e. it takes more time for the client to receive the first record).
+- Before any record is available to the application, the driver has to receive all 2500 records.
+- The client has to hold in memory all 2500 records.

|
- The server reads the first 1000 records and sends them to the driver.
- The application can process records as soon as the first batch of 1000 is transferred.
- When the first batch has been processed, the server reads another batch and delivers it to the driver.
-- Waiting time and resource consumption (both client- and server-side) for the remaining 2500 records is deferred to when the application requests more records, which are delivered in 3 more batches.
+Further records are delivered in subsequent batches.
+- Waiting time and resource consumption (both client- and server-side) for the remaining records is deferred to when the application requests more records.
 - Resource consumption is bounded by at most 1000 records.

|===
@@ -103,16 +104,15 @@ import tracemalloc

 URI = ""
 AUTH = ("", "")

-# Returns 20 records, each with properties
+# Returns 2500 records, each with properties
 # - `output` (an expensive computation, to slow down retrieval)
 # - `dummyData` (a list of 10000 ints, about 8 KB). 
slow_query = ''' -UNWIND range(1, 20) AS s +UNWIND range(1, 2500) AS s RETURN reduce(s=s, x in range(1,1000000) | s + sin(toFloat(x))+cos(toFloat(x))) AS output, range(1, 10000) AS dummyData ''' - -# Delay for each processed record, proxy for some expensive processing. +# Delay for each processed record sleep_time = 0.5 @@ -121,42 +121,42 @@ def main(): driver.verify_connectivity() start_time = time() - log('BATCHING (execute_read)') + log('LAZY LOADING (execute_read)') tracemalloc.start() - batching(driver) + lazy_loading(driver) log(f'Peak memory usage: {tracemalloc.get_traced_memory()[1]} bytes') tracemalloc.stop() log("--- %s seconds ---" % (time() - start_time)) start_time = time() - log('NO BATCHING (execute_query)') + log('EAGER LOADING (execute_query)') tracemalloc.start() - nobatching(driver) + eager_loading(driver) log(f'Peak memory usage: {tracemalloc.get_traced_memory()[1]} bytes') tracemalloc.stop() log("--- %s seconds ---" % (time() - start_time)) -def batching(driver): +def lazy_loading(driver): def process_records(tx): log('Submit query') result = tx.run(slow_query) for record in result: - log(f'Processing record {record.get("output")}') + log(f'Processing record {int(record.get("output"))}') sleep(sleep_time) # proxy for some expensive operation with driver.session(database='neo4j') as session: processed_result = session.execute_read(process_records) -def nobatching(driver): +def eager_loading(driver): log('Submit query') records, _, _ = driver.execute_query(slow_query, database_='neo4j') for record in records: - log(f'Processing record {record.get("output")}') + log(f'Processing record {int(record.get("output"))}') sleep(sleep_time) # proxy for some expensive operation @@ -171,30 +171,43 @@ if __name__ == '__main__': .Output [source, output, role=nocollapse] ---- -[1717057433.14] LAZY LOADING (execute_read) -[1717057433.14] Submit query -[1717057433.24] Processing record 0.5309371354666308 // <1> -[1717057433.74] Processing record 
1.5309371354662915 -[1717057434.25] Processing record 2.5309371354663197 +[1718001717.79] LAZY LOADING (execute_read) +[1718001717.8] Submit query +[1718001783.53] Processing record 0 // <1> +[1718001784.03] Processing record 1 +[1718001784.53] Processing record 2 +... +[1718002283.79] Processing record 998 +[1718002284.29] Processing record 999 +[1718002350.55] Processing record 1000 // <2> +[1718002351.05] Processing record 1001 +... +[1718003149.51] Processing record 2498 +[1718003150.01] Processing record 2499 +[1718003150.52] Peak memory usage: 68566 bytes +[1718003150.52] --- 1432.7221286296844 seconds --- + +[1718003150.52] EAGER LOADING (execute_query) +[1718003150.52] Submit query +[1718003390.99] Processing record 0 // <3> +[1718003391.49] Processing record 1 ... -[1717057442.88] Processing record 19.530937135463947 -[1717057443.38] Peak memory usage: 768642 bytes -[1717057443.38] --- 10.248241662979126 seconds --- - -[1717057443.38] EAGER LOADING (execute_query) -[1717057443.38] Submit query -[1717057445.31] Processing record 0.5309371354666308 // <2> -[1717057445.81] Processing record 1.5309371354662915 -[1717057446.31] Processing record 2.5309371354663197 +[1718003890.78] Processing record 998 +[1718003891.28] Processing record 999 +[1718003891.78] Processing record 1000 // <4> +[1718003892.28] Processing record 1001 ... -[1717057454.82] Processing record 19.530937135463947 -[1717057455.34] Peak memory usage: 7081123 bytes // <3> -[1717057455.34] --- 11.960006713867188 seconds --- +[1718004641.97] Processing record 2498 +[1718004642.47] Processing record 2499 +[1718004642.98] Peak memory usage: 673940 bytes // <5> +[1718004642.98] --- 1492.4611067771912 seconds --- ---- -<1> In lazy loading, the first record is processed with a negligible delay after the query is submitted. The driver has to wait only for the server to execute the query -<2> l -<3> l +<1> With lazy loading, the first record is available ~66 seconds after the query is submitted (i.e. 
as soon as the server has retrieved the first batch of 1000 records).
+<2> It takes about the same time to receive the second batch as the first batch (similar for subsequent batches).
+<3> With eager loading, the first record is available ~240 seconds after the query is submitted (i.e. after the server has retrieved all 2500 records).
+<4> There's no delay between batches: the processing time between any two records is the same.
+<5> Memory usage is larger with eager loading than with lazy loading, because the application materializes a list of 2500 records (while with lazy loading it's never more than 1000).
====