From 91955b4bcb0e59946ffb230637d6afdd450235ea Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 4 Mar 2024 18:45:28 -0800 Subject: [PATCH 01/10] Fix docs build command in README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index d155a57..c4ed222 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,5 @@ pip install -r docs/requirements-rtd.txt then build the docs by executing the command... ``` -mkdir -p docs/build/html -sphinx-build -M html docs docs/build/html +sphinx-build -M html docs docs/build ``` From fcba83fe25b33276d5c9bb78a61d76c1c5140f09 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 4 Mar 2024 18:47:18 -0800 Subject: [PATCH 02/10] Update NetworkTracker to track time and allow setting of the PID to track --- src/nwb_benchmarks/core/_network_tracker.py | 42 ++++++++++++++------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/src/nwb_benchmarks/core/_network_tracker.py b/src/nwb_benchmarks/core/_network_tracker.py index 381deb9..c49d8bd 100644 --- a/src/nwb_benchmarks/core/_network_tracker.py +++ b/src/nwb_benchmarks/core/_network_tracker.py @@ -12,22 +12,21 @@ @contextlib.contextmanager -def network_activity_tracker(tshark_path: Union[pathlib.Path, None] = None): - """Context manager for tracking network activity and statistics for the code executed in the context""" +def network_activity_tracker(tshark_path: Union[pathlib.Path, None] = None, pid: int = None): + """ + Context manager for tracking network activity and statistics for the code executed in the context + + :param tshark_path: Path to the tshark CLI command to use for tracking network traffic + :param pid: The id of the process to compute the network statistics for. If set to None, then the + PID of the current process will be used. 
+ """ network_tracker = NetworkTracker() try: network_tracker.start_network_capture(tshark_path=tshark_path) - time.sleep(0.3) - - t0 = time.time() yield network_tracker finally: - network_tracker.stop_network_capture() - - t1 = time.time() - network_total_time = t1 - t0 - network_tracker.network_statistics["network_total_time_in_seconds"] = network_total_time + network_tracker.stop_network_capture(pid=pid) class NetworkTracker: @@ -52,11 +51,14 @@ def __init__(self): self.pid_packets = None self.network_statistics = None self.asv_network_statistics = None + self.__start_capture_time = None def start_network_capture(self, tshark_path: Union[pathlib.Path, None] = None): """ Start capturing the connections on this machine as well as all network packets + :param tshark_path: Path to the tshark CLI command to use for tracking network traffic + Side effects: This functions sets the following instance variables: * self.connections_thread * self.network_profile @@ -69,10 +71,16 @@ def start_network_capture(self, tshark_path: Union[pathlib.Path, None] = None): self.network_profiler = NetworkProfiler() self.network_profiler.start_capture(tshark_path=tshark_path) - def stop_network_capture(self): + # start the main timer + self.__start_capture_time = time.time() + + def stop_network_capture(self, pid: int = None): """ Stop capturing network packets and connections. + :param pid: The id of the process to compute the network statistics for. If set to None, then the + PID of the current process (i.e., os.getpid()) will be used. + Note: This function will fail if `start_network_capture` was not called first. 
Side effects: This functions sets the following instance variables: @@ -81,15 +89,23 @@ def stop_network_capture(self): * self.network_statistics * self.asv_network_statistics """ + # stop capturing the network self.network_profiler.stop_capture() self.connections_thread.stop() - # get the connections for the PID of this process - self.pid_connections = self.connections_thread.get_connections_for_pid(os.getpid()) + # compute the total time + stop_capture_time = time.time() + network_total_time = stop_capture_time - self.__start_capture_time + + # get the connections for the PID of this process or the PID set by the user + if pid is None: + pid = os.getpid() + self.pid_connections = self.connections_thread.get_connections_for_pid(pid) # Parse packets and filter out all the packets for this process pid by matching with the pid_connections self.pid_packets = self.network_profiler.get_packets_for_connections(self.pid_connections) # Compute all the network statistics self.network_statistics = NetworkStatistics.get_statistics(packets=self.pid_packets) + self.network_statistics["network_total_time_in_seconds"] = network_total_time # Very special structure required by ASV # 'samples' is the value tracked in our results From 83fa7bf08ab06558607165f001da41c6a27a9c28 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Mon, 4 Mar 2024 18:48:07 -0800 Subject: [PATCH 03/10] Fix #24 add docs for network tracking --- docs/development.rst | 5 +---- docs/network_tracking.rst | 24 ++++++++++++++++++++ docs/running_benchmarks.rst | 4 +++- docs/writing_benchmarks.rst | 45 ++++++++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 docs/network_tracking.rst diff --git a/docs/development.rst b/docs/development.rst index 93f48ba..4565ac5 100644 --- a/docs/development.rst +++ b/docs/development.rst @@ -112,7 +112,4 @@ which is also indented for improved human readability and line-by-line GitHub tr If this ``results`` folder eventually becomes 
too large for Git to reasonably handle, we will explore options to share via other data storage services. -Network Tracking ----------------- - -Stay tuned https://github.com/NeurodataWithoutBorders/nwb_benchmarks/issues/24 +.. include:: network_tracking.rst diff --git a/docs/network_tracking.rst b/docs/network_tracking.rst new file mode 100644 index 0000000..6ec3185 --- /dev/null +++ b/docs/network_tracking.rst @@ -0,0 +1,24 @@ +.. _network-tracking: + +Network Tracking +---------------- + +The network tracking is implemented as part of the `nwb_benchmarks.core` module and consists of the following main components: + +* ``CaptureConnections`` : This class uses the ``psutils`` library to capture network connections and map the connections to process IDs (PIDs). This information is then used downstream to allow filtering of network traffic packets by PID to allow us to distinguish between network traffic generated by us versus other processes running on the same system. See `core/_capture_connections.py `_ +* ``NetworkProfiler`` : This class uses the ``tshark`` command line tool (and ``pyshark`` package) to capture the network traffic (packets) generated by all processes on the system. In combination with `CaptureConnections` we can then filter the captured packets to retrieve the packets generated by a particular PID via the ``get_packets_for_connections`` function. See `core/_network_profiler.py `_ +* ``NetworkStatistics`` : This class provides functions for processing the network packets captured by the ``NetworkProfiler`` to compute basic network statistics, such as, the number of packets send/received or the size of the data up/downloaded. The ``get_statistics`` function provides a convenient method to retrieve all the metrics via a single function call. 
See `core/_network_statistics.py `_ +* ``NetworkTracker`` and ``network_activity_tracker`` : The ``NetworkTracker`` class, and corresponding ``network_activity_tracker`` context manager, built on the functionality implemented in the above modules to make it easy to track and compute network statistics for a given time during the execution of a code. + +.. note:: + + ``CaptureConnections`` uses `psutil.net_connections() `_, which requires sudo/root access on macOS and AIX. + +.. note:: + + Running the network tracking generates additional threads/processes in order to capture traffic while the main code is running: **1)** ``NetworkProfiler.start_capture`` generates a ``subprocess`` for running the `tshark` command line tool, which is then being terminated when ``NetworkProfiler.stop_capture`` is called and **2)** ``CaptureConnections`` implements a ``Thread`` that is being run in the background. The ``NetworkTracker`` automatically starts and terminates these processs/threads, so a user typically does not need to manage these processes/threads directly. + +Typical usage +^^^^^^^^^^^^^ + +In most cases, users will use the ``NetworkTracker`` or ``network_activity_tracker`` to track network traffic and statistics as illustrated in :ref:`network-tracking-benchmarks`. 
diff --git a/docs/running_benchmarks.rst b/docs/running_benchmarks.rst index 3c0801b..6914dd1 100644 --- a/docs/running_benchmarks.rst +++ b/docs/running_benchmarks.rst @@ -14,7 +14,9 @@ use `psutil net_connections Date: Tue, 5 Mar 2024 10:04:18 -0800 Subject: [PATCH 04/10] Update docs/network_tracking.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/network_tracking.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/network_tracking.rst b/docs/network_tracking.rst index 6ec3185..0b73a22 100644 --- a/docs/network_tracking.rst +++ b/docs/network_tracking.rst @@ -6,7 +6,7 @@ Network Tracking The network tracking is implemented as part of the `nwb_benchmarks.core` module and consists of the following main components: * ``CaptureConnections`` : This class uses the ``psutils`` library to capture network connections and map the connections to process IDs (PIDs). This information is then used downstream to allow filtering of network traffic packets by PID to allow us to distinguish between network traffic generated by us versus other processes running on the same system. See `core/_capture_connections.py `_ -* ``NetworkProfiler`` : This class uses the ``tshark`` command line tool (and ``pyshark`` package) to capture the network traffic (packets) generated by all processes on the system. In combination with `CaptureConnections` we can then filter the captured packets to retrieve the packets generated by a particular PID via the ``get_packets_for_connections`` function. See `core/_network_profiler.py `_ +* ``NetworkProfiler`` : This class uses the ``tshark`` command line tool (and ``pyshark`` package) to capture the network traffic (packets) generated by all processes on the system. In combination with ``CaptureConnections`` we can then filter the captured packets to retrieve the packets generated by a particular PID via the ``get_packets_for_connections`` function. 
See `core/_network_profiler.py `_ * ``NetworkStatistics`` : This class provides functions for processing the network packets captured by the ``NetworkProfiler`` to compute basic network statistics, such as, the number of packets send/received or the size of the data up/downloaded. The ``get_statistics`` function provides a convenient method to retrieve all the metrics via a single function call. See `core/_network_statistics.py `_ * ``NetworkTracker`` and ``network_activity_tracker`` : The ``NetworkTracker`` class, and corresponding ``network_activity_tracker`` context manager, built on the functionality implemented in the above modules to make it easy to track and compute network statistics for a given time during the execution of a code. From d0efc2a3617ab63e3129eafe5f6ec5a54776a0cf Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:04:29 -0800 Subject: [PATCH 05/10] Update docs/network_tracking.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/network_tracking.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/network_tracking.rst b/docs/network_tracking.rst index 0b73a22..fd6625a 100644 --- a/docs/network_tracking.rst +++ b/docs/network_tracking.rst @@ -7,7 +7,7 @@ The network tracking is implemented as part of the `nwb_benchmarks.core` module * ``CaptureConnections`` : This class uses the ``psutils`` library to capture network connections and map the connections to process IDs (PIDs). This information is then used downstream to allow filtering of network traffic packets by PID to allow us to distinguish between network traffic generated by us versus other processes running on the same system. See `core/_capture_connections.py `_ * ``NetworkProfiler`` : This class uses the ``tshark`` command line tool (and ``pyshark`` package) to capture the network traffic (packets) generated by all processes on the system. 
In combination with ``CaptureConnections`` we can then filter the captured packets to retrieve the packets generated by a particular PID via the ``get_packets_for_connections`` function. See `core/_network_profiler.py `_ -* ``NetworkStatistics`` : This class provides functions for processing the network packets captured by the ``NetworkProfiler`` to compute basic network statistics, such as, the number of packets send/received or the size of the data up/downloaded. The ``get_statistics`` function provides a convenient method to retrieve all the metrics via a single function call. See `core/_network_statistics.py `_ +* ``NetworkStatistics`` : This class provides functions for processing the network packets captured by the ``NetworkProfiler`` to compute basic network statistics, such as, the number of packets sent/received or the size of the data up/downloaded. The ``get_statistics`` function provides a convenient method to retrieve all the metrics via a single function call. See `core/_network_statistics.py `_ * ``NetworkTracker`` and ``network_activity_tracker`` : The ``NetworkTracker`` class, and corresponding ``network_activity_tracker`` context manager, built on the functionality implemented in the above modules to make it easy to track and compute network statistics for a given time during the execution of a code. .. note:: From 59468e3c5ce99c0d8430e357a8aec42268ab12e8 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:04:53 -0800 Subject: [PATCH 06/10] Update docs/network_tracking.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/network_tracking.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/network_tracking.rst b/docs/network_tracking.rst index fd6625a..9029de3 100644 --- a/docs/network_tracking.rst +++ b/docs/network_tracking.rst @@ -16,7 +16,7 @@ The network tracking is implemented as part of the `nwb_benchmarks.core` module .. 
note:: - Running the network tracking generates additional threads/processes in order to capture traffic while the main code is running: **1)** ``NetworkProfiler.start_capture`` generates a ``subprocess`` for running the `tshark` command line tool, which is then being terminated when ``NetworkProfiler.stop_capture`` is called and **2)** ``CaptureConnections`` implements a ``Thread`` that is being run in the background. The ``NetworkTracker`` automatically starts and terminates these processs/threads, so a user typically does not need to manage these processes/threads directly. + Running the network tracking generates additional threads/processes in order to capture traffic while the main code is running: **1)** ``NetworkProfiler.start_capture`` generates a ``subprocess`` for running the ``tshark`` command line tool, which is then being terminated when ``NetworkProfiler.stop_capture`` is called and **2)** ``CaptureConnections`` implements a ``Thread`` that is being run in the background. The ``NetworkTracker`` automatically starts and terminates these processes/threads, so a user typically does not need to manage these directly. 
Typical usage ^^^^^^^^^^^^^ From de7d7e3f5d9a73fd07f3c46c864cc6e29e48a9ba Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:05:04 -0800 Subject: [PATCH 07/10] Update docs/writing_benchmarks.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/writing_benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/writing_benchmarks.rst b/docs/writing_benchmarks.rst index 33c1ebd..781971c 100644 --- a/docs/writing_benchmarks.rst +++ b/docs/writing_benchmarks.rst @@ -125,7 +125,7 @@ and so we managed to save ~5 lines of code for every occurrence of this logic in Writing a network tracking benchmark ------------------------------------ -Functions that require network access ---such as reading a file from S3--- are often a black box, with functions in other libraries (e.g., `h5py`, `fsspec` etc.) managing the access to the remote resources. The runtime performance of such functions is often inherently driven by how these functions utilize the network to access the resources. It is, hence, important that we can profile the network traffic that is being generated to better understand, e.g., the amount of data that is being downloaded and uploaded, the number of requests that are being sent/received, and others. +Functions that require network access ---such as reading a file from S3--- are often a black box, with functions in other libraries (e.g., ``h5py``, ``fsspec``, etc.) managing the access to the remote resources. The runtime performance of such functions is often inherently driven by how these functions utilize the network to access the resources. It is, hence, important that we can profile the network traffic that is being generated to better understand, e.g., the amount of data that is being downloaded and uploaded, the number of requests that are being sent/received, and others. 
To simplify the implementation of benchmarks for tracking network statistics, we implemented in the `nwb_benchmarks.core` module various helper classes and functions. The network tracking functionality is designed to track the network traffic generated by the main Python process that our tests are running during a user-defined period of time. The `network_activity_tracker` context manager can be used to track the network traffic generated by the code within the context. A basic network benchmark, then looks as follows: From f433d9babf1305b7e24375a63202116b223534e1 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:05:16 -0800 Subject: [PATCH 08/10] Update docs/writing_benchmarks.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/writing_benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/writing_benchmarks.rst b/docs/writing_benchmarks.rst index 781971c..ae0628f 100644 --- a/docs/writing_benchmarks.rst +++ b/docs/writing_benchmarks.rst @@ -127,7 +127,7 @@ Writing a network tracking benchmark Functions that require network access ---such as reading a file from S3--- are often a black box, with functions in other libraries (e.g., ``h5py``, ``fsspec``, etc.) managing the access to the remote resources. The runtime performance of such functions is often inherently driven by how these functions utilize the network to access the resources. It is, hence, important that we can profile the network traffic that is being generated to better understand, e.g., the amount of data that is being downloaded and uploaded, the number of requests that are being sent/received, and others. -To simplify the implementation of benchmarks for tracking network statistics, we implemented in the `nwb_benchmarks.core` module various helper classes and functions. 
The network tracking functionality is designed to track the network traffic generated by the main Python process that our tests are running during a user-defined period of time. The `network_activity_tracker` context manager can be used to track the network traffic generated by the code within the context. A basic network benchmark, then looks as follows: +To simplify the implementation of benchmarks for tracking network statistics, we implemented in the ``nwb_benchmarks.core`` module various helper classes and functions. The network tracking functionality is designed to track the network traffic generated by the main Python process that our tests are running during a user-defined period of time. The ``network_activity_tracker`` context manager can be used to track the network traffic generated by the code within the context. A basic network benchmark then looks as follows: .. code-block:: python From b5c385a7faa1e5bd4e358a1a736087f92feaabb7 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:05:41 -0800 Subject: [PATCH 09/10] Update docs/writing_benchmarks.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/writing_benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/writing_benchmarks.rst b/docs/writing_benchmarks.rst index ae0628f..f4efccb 100644 --- a/docs/writing_benchmarks.rst +++ b/docs/writing_benchmarks.rst @@ -142,7 +142,7 @@ To simplify the implementation of benchmarks for tracking network statistics, we x = requests.get('https://nwb-benchmarks.readthedocs.io/en/latest/setup.html') return network_tracker.asv_network_statistics -In cases where a context manager may not be sufficient, we can alternatively use the `NetworkTracker` class directly to explicitly control when to start and stop the tracking. 
+In cases where a context manager may not be sufficient, we can alternatively use the ``NetworkTracker`` class directly to explicitly control when to start and stop the tracking. .. code-block:: python From 7ea98c2e838efc5078fe07a39e0ee85fa6218ed0 Mon Sep 17 00:00:00 2001 From: Oliver Ruebel Date: Tue, 5 Mar 2024 10:05:54 -0800 Subject: [PATCH 10/10] Update docs/writing_benchmarks.rst Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> --- docs/writing_benchmarks.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/writing_benchmarks.rst b/docs/writing_benchmarks.rst index f4efccb..2c14179 100644 --- a/docs/writing_benchmarks.rst +++ b/docs/writing_benchmarks.rst @@ -159,4 +159,4 @@ In cases where a context manager may not be sufficient, we can alternatively use tracker.stop_network_capture() return tracker.asv_network_statistics -By default, the `NetworkTracker` and `network_activity_tracker` track the network activity of the current process ID (i.e., ``os.getpid()``), but the PID to track can also be set explicitly if a different process needs to be monitored. +By default, the ``NetworkTracker`` and ``network_activity_tracker`` track the network activity of the current process ID (i.e., ``os.getpid()``), but the PID to track can also be set explicitly if a different process needs to be monitored.