Skip to content

Commit

Permalink
Added infrastructure to run local Jupyter Lab instance and also added…
Browse files Browse the repository at this point in the history
… pre-commit configuration
  • Loading branch information
astrofrog committed Nov 13, 2024
1 parent 404fe60 commit 7aed9be
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 54 deletions.
15 changes: 15 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates

version: 2
updates:
- package-ecosystem: "github-actions" # See documentation for possible values
directory: ".github/workflows" # Location of package manifests
schedule:
interval: "weekly"
groups:
actions:
patterns:
- "*"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ __pycache__
dist
build
.ipynb_checkpoints
__pycache__
29 changes: 29 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
ci:
autofix_prs: false
autoupdate_schedule: 'monthly'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-added-large-files
args: ["--enforce-all", "--maxkb=300"]
- id: check-case-conflict
- id: check-json
- id: check-merge-conflict
- id: check-symlinks
- id: check-toml
- id: check-xml
- id: check-yaml
exclude: ".*(.github.*)$"
- id: detect-private-key
- id: end-of-file-fixer
exclude: ".*(data.*|extern.*|licenses.*|_static.*|_parsetab.py)$"
- id: trailing-whitespace

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.3.4"
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format
2 changes: 2 additions & 0 deletions jupyter_output_monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
from ._monitor import monitor
from ._version import __version__

__all__ = ["monitor", "__version__"]
145 changes: 95 additions & 50 deletions jupyter_output_monitor/_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,81 +4,114 @@

import os
import sys
import tempfile
import time
import click
import datetime
from io import BytesIO

import numpy as np
import click
from PIL import Image
from playwright.sync_api import sync_playwright
from io import BytesIO

from ._server import jupyter_server
from ._utils import clear_notebook, isotime

RG_SPECIAL = (143, 56)

def isotime():
return datetime.datetime.now().isoformat()

@click.command()
@click.option(
    "--notebook",
    default=None,
    help="The notebook to profile. If specified a local Jupyter Lab instance will be run",
)
@click.option(
    "--url",
    default=None,
    help="The URL hosting the notebook to profile, including any token and notebook path.",
)
@click.option(
    "--output",
    default=None,
    help="Output directory - if not specified, this defaults to output-<timestamp>",
)
@click.option(
    "--wait-after-execute",
    default=10,
    help="Time in s to wait after executing each cell",
)
@click.option("--headless", is_flag=True, help="Whether to run in headless mode")
def monitor(notebook, url, output, wait_after_execute, headless):
    """Execute a notebook cell by cell and record screenshots of its outputs."""
    # Exactly one of --notebook / --url must be given. Validate this before
    # touching the filesystem so that a usage error does not leave an empty
    # output directory behind.
    if notebook is None and url is None:
        print("Either --notebook or --url should be specified")
        sys.exit(1)
    if notebook is not None and url is not None:
        print("Only one of --notebook or --url should be specified")
        sys.exit(1)

    if output is None:
        output = f"output-{isotime()}"

    # Refuse to write into an existing directory rather than mixing runs.
    if os.path.exists(output):
        print(f"Output directory {output} already exists")
        sys.exit(1)

    os.makedirs(output)

    if url is not None:
        # A URL was supplied, so monitor the already-running instance.
        _monitor_output(url, output, wait_after_execute, headless)
        return

    # Serve a cleaned copy of the notebook from a temporary directory. The
    # directory is removed once the server has been stopped.
    with tempfile.TemporaryDirectory() as notebook_dir:
        clear_notebook(notebook, os.path.join(notebook_dir, "notebook.ipynb"))
        with jupyter_server(notebook_dir) as server:
            url = server.base_url + "/lab/tree/notebook.ipynb"
            _monitor_output(url, output, wait_after_execute, headless)


def _monitor_output(url, output, wait_after_execute, headless):
# Index of the current last screenshot, by output index
last_screenshot = {}

with sync_playwright() as p, open(os.path.join(output, 'event_log.csv'), 'w') as log:

log.write('time,event,index,screenshot\n')
with (
sync_playwright() as p,
open(os.path.join(output, "event_log.csv"), "w") as log,
):
log.write("time,event,index,screenshot\n")
log.flush()

# Launch browser and open URL

browser = p.firefox.launch(headless=headless)
page = browser.new_page(viewport={'width':2000, 'height':10000})
page = browser.new_page(viewport={"width": 2000, "height": 10000})
page.goto(url)

while True:

print('Checking for input cells')
print("Checking for input cells")

# Construct list of input and output cells in the notebook
input_cells = list(page.query_selector_all('.jp-InputArea-editor'))
input_cells = list(page.query_selector_all(".jp-InputArea-editor"))

# Keep only input cells that are visible
input_cells = [cell for cell in input_cells if cell.is_visible()]

if len(input_cells) > 0:
break

print('-> No input cells found, waiting before checking again')
print("-> No input cells found, waiting before checking again")

# If no visible input cells, wait and try again
page.wait_for_timeout(1000)

print(f'{len(input_cells)} input cells found')
print(f"{len(input_cells)} input cells found")

last_screenshot = {}

# Now loop over each input cell and execute
for input_index, input_cell in enumerate(input_cells):

if input_cell.text_content().strip() == '':
print(f'Skipping empty input cell {input_index}')
if input_cell.text_content().strip() == "":
print(f"Skipping empty input cell {input_index}")
continue

print(f'Execute input cell {input_index}')
print(f"Execute input cell {input_index}")

# Take screenshot before we start executing cell but save it after
screenshot_bytes = input_cell.screenshot()
Expand All @@ -87,48 +120,51 @@ def monitor(url, output, wait_after_execute, headless):
input_cell.click()

# Execute it
page.keyboard.press('Shift+Enter')
page.keyboard.press("Shift+Enter")

timestamp = isotime()

screenshot_filename = os.path.join(output, f'input-{input_index:03d}-{timestamp}.png')
screenshot_filename = os.path.join(
output,
f"input-{input_index:03d}-{timestamp}.png",
)
image = Image.open(BytesIO(screenshot_bytes))
image.save(screenshot_filename)

log.write(f'{timestamp},execute-input,{input_index},{screenshot_filename}\n')
log.write(
f"{timestamp},execute-input,{input_index},{screenshot_filename}\n",
)

# Now loop and check for changes in any of the output cells - if a cell
# output changes, save a screenshot

print('Watching for changes in output cells')
print("Watching for changes in output cells")

start = time.time()
while time.time() - start < wait_after_execute:

output_cells = list(page.query_selector_all('.jp-OutputArea-output'))
output_cells = list(page.query_selector_all(".jp-OutputArea-output"))

for output_cell in output_cells:

if not output_cell.is_visible():
continue

# The element we are interested in is one level down

div = output_cell.query_selector('div')
div = output_cell.query_selector("div")

if div is None:
continue

style = div.get_attribute('style')
style = div.get_attribute("style")

if style is None or 'border-color: rgb(' not in style:
if style is None or "border-color: rgb(" not in style:
continue

# Parse rgb values for border
start_pos = style.index('border-color:')
start_pos = style.index('(', start_pos) + 1
end_pos = style.index(')', start_pos)
r, g, b = [int(x) for x in style[start_pos:end_pos].split(',')]
start_pos = style.index("border-color:")
start_pos = style.index("(", start_pos) + 1
end_pos = style.index(")", start_pos)
r, g, b = (int(x) for x in style[start_pos:end_pos].split(","))

# The (r,g) pair is chosen to be random and unlikely to
# happen by chance on the page. If these values don't match, we
Expand All @@ -142,30 +178,39 @@ def monitor(url, output, wait_after_execute, headless):
# which should be sufficient
output_index = b

print(f'- taking screenshot of output cell {output_index}')
print(f"- taking screenshot of output cell {output_index}")

screenshot_bytes = div.screenshot()

# If screenshot didn't exist before for this cell or if it has
# changed, we save it to a file and keep track of it.
if output_index not in last_screenshot or last_screenshot[output_index] != screenshot_bytes:

print(f' -> change detected!')
if (
output_index not in last_screenshot
or last_screenshot[output_index] != screenshot_bytes
):
print(" -> change detected!")

timestamp = isotime()
screenshot_filename = os.path.join(output, f'output-{output_index:03d}-{timestamp}.png')
screenshot_filename = os.path.join(
output,
f"output-{output_index:03d}-{timestamp}.png",
)
image = Image.open(BytesIO(screenshot_bytes))
image.save(screenshot_filename)

log.write(f'{timestamp},output-changed,{output_index},{screenshot_filename}\n')
log.write(
f"{timestamp},output-changed,{output_index},{screenshot_filename}\n",
)
log.flush()

print(f"Saving screenshot of output {output_index} at {timestamp}")
print(
f"Saving screenshot of output {output_index} at {timestamp}",
)

last_screenshot[output_index] = screenshot_bytes

print('Stopping monitoring output and moving on to next input cell')
print("Stopping monitoring output and moving on to next input cell")


if __name__ == '__main__':
if __name__ == "__main__":
monitor()
20 changes: 20 additions & 0 deletions jupyter_output_monitor/_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from contextlib import contextmanager

from solara.test.pytest_plugin import (
ServerJupyter,
)

from ._utils import get_free_port

__all__ = ["jupyter_server"]


@contextmanager
def jupyter_server(notebook_path):
    """Context manager yielding a started ``ServerJupyter`` instance.

    The server is created for ``notebook_path`` on ``localhost`` using a port
    obtained from ``get_free_port()``. It is started and waited on before
    being yielded, and ``stop_serving()`` is always called on exit.
    """
    port = get_free_port()
    jupyter = ServerJupyter(notebook_path, port, "localhost")
    try:
        jupyter.serve_threaded()
        jupyter.wait_until_serving()
        yield jupyter
    finally:
        # Runs on normal exit and on error, including a failed start-up.
        jupyter.stop_serving()
33 changes: 33 additions & 0 deletions jupyter_output_monitor/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import datetime
import socket

from nbconvert import NotebookExporter
from traitlets.config import Config

__all__ = ["get_free_port", "clear_notebook", "isotime"]


def get_free_port():
    """Return a port number that was free at the time of the call.

    A socket is bound to port 0 so that the operating system picks an unused
    port; the chosen number is read back and the socket is closed before
    returning. Nothing reserves the port afterwards, so another process could
    in principle claim it before the caller does.
    """
    # Use the socket as a context manager so that it is closed explicitly
    # instead of being left for the garbage collector.
    with socket.socket() as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]


def clear_notebook(input_notebook, output_notebook):
    """Write out a copy of the notebook with output and metadata removed.

    Parameters
    ----------
    input_notebook
        Path of the notebook file to read.
    output_notebook
        Path the cleaned notebook is written to.
    """
    c = Config()
    c.NotebookExporter.preprocessors = [
        "nbconvert.preprocessors.ClearOutputPreprocessor",
        "nbconvert.preprocessors.ClearMetadataPreprocessor",
    ]

    exporter = NotebookExporter(config=c)
    # from_filename also returns a resources object, which is not needed here.
    body, _ = exporter.from_filename(input_notebook)

    # Write as UTF-8 explicitly: relying on the locale's default encoding can
    # fail or corrupt non-ASCII notebook content on some platforms.
    with open(output_notebook, "w", encoding="utf-8") as f:
        f.write(body)


def isotime():
    """Return the current local time as an ISO 8601 formatted string."""
    now = datetime.datetime.now()
    return now.isoformat()
17 changes: 13 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,17 @@ find = {namespaces = false}
write_to = "jupyter_output_monitor/_version.py"

[tool.ruff]
lint.select = [
"B", # flake8-bugbear
"I", # isort
"UP", # pyupgrade
lint.select = ["ALL"]
lint.ignore = [
"A00",
"ANN",
"T201",
"PTH",
"D100",
"D103",
"D104",
"C901",
"PLR0915",
"DTZ",
"E501"
]

0 comments on commit 7aed9be

Please sign in to comment.