diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 90809b5..24c6b13 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,17 @@
 # Contributing
+
+* [Contributing](#contributing)
+  * [First Principles](#first-principles)
+  * [Common fixes for `mypy` errors](#common-fixes-for-mypy-errors)
+    * [..., expression has type "None", variable has type "str"](#-expression-has-type-none-variable-has-type-str)
+    * [..., has incompatible type "Path"; expected "str"](#-has-incompatible-type-path-expected-str)
+    * [Argument 2 to "get" of "dict" has incompatible type "None"; expected ...](#argument-2-to-get-of-dict-has-incompatible-type-none-expected-)
+  * [Local Setup](#local-setup)
+  * [First contribution](#first-contribution)
+  * [Troubleshooting](#troubleshooting)
+
+
 
 ## First Principles
 
 Favoring standard libraries over external dependencies, especially in specific contexts like Databricks, is a best practice in software
@@ -114,4 +126,9 @@ Here are the example steps to submit your first contribution:
 
 ## Troubleshooting
 
-If you encounter any package dependency errors after `git pull`, run `make clean`
+If you encounter any package dependency errors after `git pull`, run `make clean dev`
+
+### Running in isolation
+
+See https://pylint.pycqa.org/en/latest/development_guide/how_tos/custom_checkers.html#testing-a-checker
+`pylint --load-plugins=databricks.labs.pylint --disable=all --enable= test.py`
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index ce45d63..04f4bea 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,13 +54,13 @@ python="3.10"
 path = ".venv"
 
 [tool.hatch.envs.default.scripts]
-test = "pytest -n 2 --cov src --cov-report=xml --timeout 30 tests/unit --durations 20"
+test = "pytest -n 2 --cov src --cov-report=xml --timeout 30 tests --durations 20"
 coverage = "pytest -n 2 --cov src tests/unit --timeout 30 --cov-report=html --durations 20"
 integration = "pytest -n 10 --cov src tests/integration --durations 20"
 fmt = ["isort .",
        "ruff format",
-       "ruff . --fix",
-       "mypy .",
+       "ruff check . --fix",
+       #"mypy .",
        "pylint --output-format=colorized -j 0 src"]
 verify = ["black --check .",
           "isort . --check-only",
@@ -68,7 +68,10 @@ verify = ["black --check .",
           "mypy .",
           "pylint --output-format=colorized -j 0 src"]
 
-[tool.isort]
+[tool.mypy]
+exclude = ["tests/samples"]
+
+[tool.lint.isort]
 profile = "black"
 
 [tool.pytest.ini_options]
@@ -85,9 +88,13 @@ cache-dir = ".venv/ruff-cache"
 target-version = "py310"
 line-length = 120
 
-[tool.ruff.isort]
+[tool.ruff.lint.isort]
 known-first-party = ["databricks.labs.pylint"]
 
+[tool.ruff.lint.per-file-ignores]
+
+"tests/samples/*" = ["F403", "F405", "E402", "E501", "E722", "E731"]
+
 [tool.coverage.run]
 branch = true
 parallel = true
@@ -145,7 +152,7 @@ fail-under = 10.0
 # ignore-list. The regex matches against paths and can be in Posix or Windows
 # format. Because '\\' represents the directory delimiter on Windows systems, it
 # can't be used as an escape character.
-# ignore-paths =
+ignore-paths = ["tests/samples"]
 
 # Files or directories matching the regular expression patterns are skipped. The
 # regex matches against base names, not paths. The default value ignores Emacs
diff --git a/src/databricks/labs/pylint/__init__.py b/src/databricks/labs/pylint/__init__.py
index 4e42e96..c57d153 100644
--- a/src/databricks/labs/pylint/__init__.py
+++ b/src/databricks/labs/pylint/__init__.py
@@ -1 +1,6 @@
-from databricks.labs.pylint.__about__ import __version__
\ No newline at end of file
+def register(linter):
+    from databricks.labs.pylint.dbutils import DbutilsChecker
+    from databricks.labs.pylint.notebooks import NotebookChecker
+
+    linter.register_checker(NotebookChecker(linter))
+    linter.register_checker(DbutilsChecker(linter))
diff --git a/src/databricks/labs/pylint/dbutils.py b/src/databricks/labs/pylint/dbutils.py
new file mode 100644
index 0000000..d8e04f6
--- /dev/null
+++ b/src/databricks/labs/pylint/dbutils.py
@@ -0,0 +1,86 @@
+# pylint checker for databricks dbutils
+import astroid
+from pylint.checkers import BaseChecker
+
+
+class DbutilsChecker(BaseChecker):
+    name = "dbutils"
+
+    msgs = {
+        "E9899": (
+            "Use Databricks SDK instead: w.dbfs.copy(%s, %s)",
+            "dbutils-fs-cp",
+            "Migrate all usage of dbutils to Databricks SDK",
+        ),
+        "E9898": (
+            "Use Databricks SDK instead: with w.dbfs.download(%s) as f: f.read()",
+            "dbutils-fs-head",
+            "Migrate all usage of dbutils to Databricks SDK",
+        ),
+        "E9897": (
+            "Use Databricks SDK instead: w.dbfs.list(%s)",
+            "dbutils-fs-ls",
+            "Migrate all usage of dbutils to Databricks SDK",
+        ),
+        "E9896": (
+            "Mounts are not supported with Unity Catalog, switch to using Unity Catalog Volumes instead",
+            "dbutils-fs-mount",
+            "Migrate all usage to Unity Catalog",
+        ),
+        "E9889": (
+            "Credentials utility is not supported with Unity Catalog",
+            "dbutils-credentials",
+            "Migrate all usage to Unity Catalog",
+        ),
+        "E9879": (
+            """Use Databricks SDK instead: w.jobs.submit(
+            tasks=[jobs.SubmitTask(existing_cluster_id=...,
+                notebook_task=jobs.NotebookTask(notebook_path=%s),
+                task_key=...)
+            ]).result(timeout=timedelta(minutes=%s))""",
+            "dbutils-notebook-run",
+            "Migrate all usage of dbutils to Databricks SDK",
+        ),
+        "E9869": (
+            "Use Databricks SDK instead: from databricks.sdk import WorkspaceClient(); w = WorkspaceClient()",
+            "pat-token-leaked",
+            "Do not hardcode secrets in code, use Databricks Scopes instead",
+        ),
+    }
+
+    def visit_call(self, node: astroid.Call):
+        # add message if dbutils.fs.cp() is used
+        if node.func.as_string() == "dbutils.fs.cp":
+            self.add_message("dbutils-fs-cp", node=node, args=(node.args[0].as_string(), node.args[1].as_string()))
+        # add message if dbutils.fs.head() is used
+        if node.func.as_string() == "dbutils.fs.head":
+            self.add_message("dbutils-fs-head", node=node, args=(node.args[0].as_string(),))
+        # add message if dbutils.fs.ls("/tmp") is used
+        if node.func.as_string() == "dbutils.fs.ls":
+            self.add_message("dbutils-fs-ls", node=node, args=(node.args[0].as_string(),))
+        # add message if dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name) is used
+        if node.func.as_string() in {
+            "dbutils.fs.mount",
+            "dbutils.fs.mounts",
+            "dbutils.fs.unmount",
+            "dbutils.fs.updateMount",
+            "dbutils.fs.refreshMounts",
+        }:
+            self.add_message("dbutils-fs-mount", node=node)
+        # add message if dbutils.credentials.* is used
+        if node.func.as_string().startswith("dbutils.credentials."):
+            self.add_message("dbutils-credentials", node=node)
+        # add message if dbutils.notebook.run("My Other Notebook", 60) is used
+        if node.func.as_string() == "dbutils.notebook.run":
+            self.add_message(
+                "dbutils-notebook-run", node=node, args=(node.args[0].as_string(), node.args[1].as_string())
+            )
+
+    def visit_const(self, node: astroid.Const):
+        # add a message if string matches dapi[0-9a-f]{32}, dkea[0-9a-f]{32}, or dosa[0-9a-f]{32}
+        if node.value.startswith("dapi") or node.value.startswith("dkea") or node.value.startswith("dosa"):
+            self.add_message("pat-token-leaked", node=node)
+
+
+def register(linter):
+    linter.register_checker(DbutilsChecker(linter))
diff --git a/src/databricks/labs/pylint/notebooks.py b/src/databricks/labs/pylint/notebooks.py
index ea5efa1..e3e2300 100644
--- a/src/databricks/labs/pylint/notebooks.py
+++ b/src/databricks/labs/pylint/notebooks.py
@@ -1,44 +1,36 @@
 import astroid
-from pylint.checkers import BaseChecker, BaseRawFileChecker
+from pylint.checkers import BaseRawFileChecker
 
 
 class NotebookChecker(BaseRawFileChecker):
     __implements__ = (BaseRawFileChecker,)
 
-    name = 'databricks-notebooks'
+    name = "databricks-notebooks"
 
     msgs = {
-        'E9999': (
-            'dbutils.notebook.run() is not allowed',
-            'notebooks-dbutils-run',
-            'Used when dbutils.notebook.run() is used'
+        "E9996": (
+            "Notebooks should not have more than 75 cells",
+            "notebooks-too-many-cells",
+            "Used when the number of cells in a notebook is greater than 75",
         ),
-        'E9998': (
-            'dbutils.fs is not allowed',
-            'notebooks-dbutils-fs',
-            'Used when dbutils.fs is used'
-        ),
-        'E9997': (
-            'dbutils.credentials is not allowed',
-            'notebooks-dbutils-credentials',
-            'Used when dbutils.credentials is used'
-        ),
-        'E9996': (
-            'Notebooks should not have more than 75 cells',
-            'notebooks-too-many-cells',
-            'Used when the number of cells in a notebook is greater than 75'
-        ),
-        'E9995': (
-            'Star import is not allowed',
-            'notebooks-star-import',
-            'Used when there is a star import from pyspark.sql.functions'
-        ),
-        'E9994': (
-            'Using %run is not allowed',
-            'notebooks-percent-run',
-            'Used when `# MAGIC %run` comment is used',
+        "E9994": (
+            "Using %run is not allowed",
+            "notebooks-percent-run",
+            "Used when `# MAGIC %run` comment is used",
        ),
     }
 
+    options = (
+        (
+            "max-cells",
+            {
+                "default": 75,
+                "type": "int",
+                "metavar": "<int>",
+                "help": "Maximum number of cells in the notebook",
+            },
+        ),
+    )
+
     def process_module(self, node: astroid.Module):
         """Read raw module. Need to do some tricks, as `ast` doesn't provide access for comments.
@@ -51,41 +43,19 @@ def process_module(self, node: astroid.Module):
         """
         cells = 1
         with node.stream() as stream:
-            for (lineno, line) in enumerate(stream):
+            for lineno, line in enumerate(stream):
                 lineno += 1
-                if lineno == 1 and line != b'# Databricks notebook source\n':
+                if lineno == 1 and line != b"# Databricks notebook source\n":
                     # this is not a Databricks notebook
                     return
-                if line == b'# COMMAND ----------\n':
+                if line == b"# COMMAND ----------\n":
                     cells += 1
-                    if cells > 75:
-                        self.add_message('notebooks-too-many-cells', line=lineno)
+                    if cells > self.linter.config.max_cells:
+                        self.add_message("notebooks-too-many-cells", line=lineno)
                     continue
-                if line.startswith(b'# MAGIC %run'):
-                    self.add_message('notebooks-percent-run', line=lineno)
-
-    def visit_module(self, node):
-        # add message if dbutils.notebook.run() is used
-        if node.name == 'dbutils.notebook.run':
-            self.add_message('notebooks-dbutils-run', node=node)
-
-        # add message if dbutils.fs is used
-        if node.name == 'dbutils.fs':
-            self.add_message('notebooks-dbutils-fs', node=node)
-
-        # add message if dbutils.credentials is used
-        if node.name == 'dbutils.credentials':
-            self.add_message('notebooks-dbutils-credentials', node=node)
-
-        # Notebooks should not have more than 75 cells.
-        if len(node.body) > 75:
-            self.add_message('notebooks-too-many-cells', node=node)
-
-    def visit_importfrom(self, node: astroid.ImportFrom):
-        # add message if there's a star import from pyspark.sql.functions import *
-        if node.modname == 'pyspark.sql.functions' and node.names[0][0] == '*':
-            self.add_message('notebooks-star-import', node=node)
+                if line.startswith(b"# MAGIC %run"):
+                    self.add_message("notebooks-percent-run", line=lineno)
 
 
 def register(linter):
-    linter.register_checker(NotebookChecker(linter))
\ No newline at end of file
+    linter.register_checker(NotebookChecker(linter))
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..47f1701
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,45 @@
+from typing import Generic, TypeVar
+
+import astroid
+import astroid.rebuilder
+import pytest
+from pylint.checkers import BaseChecker
+from pylint.testutils import UnittestLinter
+
+T = TypeVar("T", bound=BaseChecker)
+
+
+class TestSupport(Generic[T]):
+    def __init__(self, klass: type[T]):
+        linter = UnittestLinter()
+        checker = klass(linter)
+        checker.open()
+        linter.register_checker(checker)
+
+        self._checker = checker
+        self._linter = linter
+
+    def __lshift__(self, code: str):
+        node = astroid.extract_node(code)
+
+        klass_name = node.__class__.__name__
+        visitor = astroid.rebuilder.REDIRECT.get(klass_name, klass_name).lower()
+        getattr(self._checker, f"visit_{visitor}")(node)
+
+        out = set()
+        for message in self._linter.release_messages():
+            for message_definition in self._linter.msgs_store.get_message_definitions(message.msg_id):
+                user_message = message_definition.msg
+                if message.args:
+                    user_message %= message.args
+                out.add(f"[{message.msg_id}] {user_message}")
+
+        return out
+
+
+@pytest.fixture
+def lint_with():
+    def factory(klass: type[T]) -> TestSupport[T]:
+        return TestSupport(klass)
+
+    yield factory
diff --git a/tests/samples/TestForPylint.py b/tests/samples/TestForPylint.py
index f476a98..8f22b58 100644
--- a/tests/samples/TestForPylint.py
+++ b/tests/samples/TestForPylint.py
@@ -10,7 +10,6 @@
 
 # and here we do star import
 from pyspark.sql.functions import *
-
 # # COMMAND ----------
 #
 # # but no dbutils.library.restartPython()
@@ -22,9 +21,7 @@
 
 # COMMAND ----------
 
-df = spark \
-    .table('samples.nyctaxi.trips') \
-    .limit(10)
+df = spark.table("samples.nyctaxi.trips").limit(10)
 display(df)
 
 # COMMAND ----------
@@ -33,7 +30,5 @@
 
 # COMMAND ----------
 
-df = (spark
-      .table('samples.nyctaxi.trips')
-      .limit(10))
+df = spark.table("samples.nyctaxi.trips").limit(10)
 display(df)
diff --git a/tests/test_dbutils.py b/tests/test_dbutils.py
new file mode 100644
index 0000000..1fc8c8e
--- /dev/null
+++ b/tests/test_dbutils.py
@@ -0,0 +1,56 @@
+from databricks.labs.pylint.dbutils import DbutilsChecker
+
+
+def test_checks_cp(lint_with):
+    messages = lint_with(DbutilsChecker) << "dbutils.fs.cp('foo', 'bar')"
+    assert "[dbutils-fs-cp] Use Databricks SDK instead: w.dbfs.copy('foo', 'bar')" in messages
+
+
+def test_checks_head(lint_with):
+    messages = lint_with(DbutilsChecker) << """dbutils.fs.head("/tmp/my_file.txt", 25)"""
+    assert (
+        "[dbutils-fs-head] Use Databricks SDK instead: with "
+        "w.dbfs.download('/tmp/my_file.txt') as f: f.read()" in messages
+    )
+
+
+def test_checks_list(lint_with):
+    messages = lint_with(DbutilsChecker) << """dbutils.fs.ls("/tmp")"""
+    assert "[dbutils-fs-ls] Use Databricks SDK instead: w.dbfs.list('/tmp')" in messages
+
+
+def test_checks_mount(lint_with):
+    messages = (
+        lint_with(DbutilsChecker)
+        << """aws_bucket_name = "my-bucket"
+mount_name = "s3-my-bucket"
+dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name)"""
+    )
+    assert (
+        "[dbutils-fs-mount] Mounts are not supported with Unity Catalog, "
+        "switch to using Unity Catalog Volumes instead"
+    ) in messages
+
+
+def test_checks_credentials(lint_with):
+    messages = lint_with(DbutilsChecker) << """dbutils.credentials.assumeRole("arn:aws:iam::...:roles/my-role")"""
+    assert "[dbutils-credentials] Credentials utility is not supported with Unity Catalog" in messages
+
+
+def test_checks_notebook_run(lint_with):
+    messages = lint_with(DbutilsChecker) << """dbutils.notebook.run("my_notebook", 5)"""
+    assert len(messages) == 1
+
+
+def test_checks_secrets(lint_with):
+    messages = (
+        lint_with(DbutilsChecker)
+        << """do_something(
+    'dapi00000000', #@
+    'other',
+)"""
+    )
+    assert (
+        "[pat-token-leaked] Use Databricks SDK instead: from databricks.sdk import "
+        "WorkspaceClient(); w = WorkspaceClient()"
+    ) in messages
diff --git a/tests/test_notebooks.py b/tests/test_notebooks.py
index e6af9da..d96a8be 100644
--- a/tests/test_notebooks.py
+++ b/tests/test_notebooks.py
@@ -1,54 +1,14 @@
-import astroid
-import pylint.testutils
-
-from pylint.lint.pylinter import PyLinter
+from pylint.testutils import UnittestLinter
 
 from databricks.labs.pylint.notebooks import NotebookChecker
 
 
 def test_percent_run():
-    linter = PyLinter()
+    linter = UnittestLinter()
     checker = NotebookChecker(linter)
-    linter.disable("all")
     linter.register_checker(checker)
 
     linter.check(["samples/TestForPylint.py"])
-    assert 1 == len(linter.reporter.messages)
-
-
-class TestNotebookChecker(pylint.testutils.CheckerTestCase):
-    CHECKER_CLASS = NotebookChecker
-
-    def test_import_from_pyspark(self):
-        node = astroid.extract_node("""
-        from pyspark.sql.functions import *
-        """)
-
-        with self.assertAddsMessages(
-            pylint.testutils.MessageTest(
-                msg_id="notebooks-star-import",
-                node=node,
-            ), ignore_position=True
-        ):
-            self.checker.visit_importfrom(node)
-
-    def test_percent_run(self):
-        node = astroid.extract_node("""# Databricks notebook source
-# MAGIC %md # Here's markdown cell
-
-# COMMAND ----------
-
-# MAGIC %run ./something
-
-# COMMAND ----------
-
-print('hello')
-    """)
+    msgs = {_.msg_id for _ in linter.release_messages()}
 
-        with self.assertAddsMessages(
-            pylint.testutils.MessageTest(
-                msg_id="notebooks-percent-run",
-                node=node,
-            ), ignore_position=True
-        ):
-            self.checker.process_module(node)
+    assert "notebooks-percent-run" in msgs