From f192fb3c05da09d4b5f8cc90ef5740ee9aef07bd Mon Sep 17 00:00:00 2001
From: Ran Ran
Date: Thu, 4 Apr 2024 14:49:28 -0700
Subject: [PATCH] Increase fail score for pylint (#240)

---
 .pylintrc                   |  1 +
 dags/vm_resource.py         | 30 +++++++++++++++++++++++-------
 scripts/code-style.sh       |  2 +-
 xlml/apis/gcp_config.py     |  3 ++-
 xlml/apis/metric_config.py  |  9 ++++-----
 xlml/apis/task.py           | 10 ++++++----
 xlml/apis/test_config.py    |  6 ++++--
 xlml/utils/bigquery.py      |  5 +++--
 xlml/utils/bigquery_test.py |  2 ++
 xlml/utils/gke.py           |  2 ++
 xlml/utils/gpu.py           |  4 ++--
 xlml/utils/metric_test.py   | 15 ++++++++-------
 xlml/utils/tpu.py           |  4 ++--
 xlml/utils/xpk.py           |  4 +---
 14 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 5c771e62..d3f6404c 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -308,6 +308,7 @@ redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functool
 # Logging modules to check that the string format arguments are in logging
 # function parameter format
 logging-modules=logging,absl.logging,tensorflow.io.logging
+disable=logging-fstring-interpolation
 
 
 [SIMILARITIES]
diff --git a/dags/vm_resource.py b/dags/vm_resource.py
index 157b3886..89c40f19 100644
--- a/dags/vm_resource.py
+++ b/dags/vm_resource.py
@@ -14,8 +14,8 @@
 
 """The file for common projects, zone, and runtime versions."""
 
-import enum
 import datetime
+import enum
 
 
 V5_NETWORKS_PREFIX = "projects/tpu-prod-env-automated"
@@ -55,7 +55,8 @@ class Zone(enum.Enum):
   US_CENTRAL1_B = "us-central1-b"
   # reserved v4-8 & v4-32 in cloud-ml-auto-solutions
   US_CENTRAL2_B = "us-central2-b"
-  # reserved/on-demand v2-8 in cloud-ml-auto-solutions & reserved h100 in supercomputer-testing
+  # reserved/on-demand v2-8 in cloud-ml-auto-solutions
+  # & reserved h100 in supercomputer-testing
   US_CENTRAL1_C = "us-central1-c"
   # committed resource for A100
   US_CENTRAL1_F = "us-central1-f"
@@ -137,8 +138,23 @@ class DockerImage(enum.Enum):
   """Common docker images."""
 
   XPK_JAX_TEST = "gcr.io/cloud-ml-auto-solutions/xpk_jax_test:latest"
-  PYTORCH_NIGHTLY = f"us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_{datetime.datetime.today().strftime('%Y%m%d')}"
-  MAXTEXT_TPU_JAX_STABLE = f"gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:{datetime.datetime.today().strftime('%Y-%m-%d')}"
-  MAXTEXT_TPU_JAX_NIGHTLY = f"gcr.io/tpu-prod-env-multipod/maxtext_jax_nightly:{datetime.datetime.today().strftime('%Y-%m-%d')}"
-  MAXTEXT_GPU_JAX_STABLE = f"gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable:{datetime.datetime.today().strftime('%Y-%m-%d')}"
-  MAXTEXT_GPU_JAX_NIGHTLY = f"gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_nightly:{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  PYTORCH_NIGHTLY = (
+      "us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/"
+      f"xla:nightly_3.10_tpuvm_{datetime.datetime.today().strftime('%Y%m%d')}"
+  )
+  MAXTEXT_TPU_JAX_STABLE = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
+  MAXTEXT_TPU_JAX_NIGHTLY = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_jax_nightly:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
+  MAXTEXT_GPU_JAX_STABLE = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_stable:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
+  MAXTEXT_GPU_JAX_NIGHTLY = (
+      "gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_nightly:"
+      f"{datetime.datetime.today().strftime('%Y-%m-%d')}"
+  )
diff --git a/scripts/code-style.sh b/scripts/code-style.sh
index 576fff94..940f80e8 100755
--- a/scripts/code-style.sh
+++ b/scripts/code-style.sh
@@ -26,7 +26,7 @@ done
 
 for folder in "${FOLDERS_TO_FORMAT[@]}"
 do
-  pylint "./$folder" --fail-under=9
+  pylint "./$folder" --fail-under=9.6
 done
 
 echo "Successfully clean up all codes."
diff --git a/xlml/apis/gcp_config.py b/xlml/apis/gcp_config.py
index a79c12c6..501ffcf6 100644
--- a/xlml/apis/gcp_config.py
+++ b/xlml/apis/gcp_config.py
@@ -15,8 +15,9 @@
 """Config file for Google Cloud Project (GCP)."""
 
 import dataclasses
-from xlml.apis import metric_config
+
 from dags.vm_resource import Project
+from xlml.apis import metric_config
 
 
 @dataclasses.dataclass
diff --git a/xlml/apis/metric_config.py b/xlml/apis/metric_config.py
index eb13885b..6fe27e30 100644
--- a/xlml/apis/metric_config.py
+++ b/xlml/apis/metric_config.py
@@ -43,7 +43,7 @@ class SshEnvVars(enum.Enum):
 
 @dataclasses.dataclass
 class JSONLinesConfig:
-  """This is a class to set up JSON Lines config.
+  """A class to set up JSON Lines config.
 
   Attributes:
     file_location: The locatioin of the file in GCS. When
@@ -55,7 +55,7 @@ class SummaryConfig:
 
 @dataclasses.dataclass
 class SummaryConfig:
-  """This is a class to set up TensorBoard summary config.
+  """A class to set up TensorBoard summary config.
 
   Attributes:
     file_location: The locatioin of the file in GCS. When
@@ -79,7 +79,7 @@ class ProfileConfig:
 
 @dataclasses.dataclass
 class ProfileConfig:
-  """This is a class to set up profile config.
+  """A class to set up profile config.
 
   Attributes:
     file_locations: The locatioin of the file in GCS. When
@@ -93,8 +93,7 @@ class MetricConfig:
 
 @dataclasses.dataclass
 class MetricConfig:
-  """This is a class to set up config of Benchmark metric,
-  dimension, and profile.
+  """A class to set up config of Benchmark metric, dimension, and profile.
 
   Attributes:
     json_lines: The config for JSON Lines input.
diff --git a/xlml/apis/task.py b/xlml/apis/task.py
index 14b73cc8..6354c3a3 100644
--- a/xlml/apis/task.py
+++ b/xlml/apis/task.py
@@ -18,12 +18,12 @@
 import dataclasses
 import datetime
 import shlex
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 import airflow
 from airflow.models.taskmixin import DAGNode
 from airflow.utils.task_group import TaskGroup
 from xlml.apis import gcp_config, metric_config, test_config
-from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke, startup_script
+from xlml.utils import gpu, metric, name_format, ssh, tpu, xpk, gke
 
 
 class BaseTask(abc.ABC):
@@ -47,6 +47,7 @@ class TpuQueuedResourceTask(BaseTask):
     task_test_config: Test configs to run on this TPU.
     task_gcp_config: Runtime TPU creation parameters.
     task_metric_config: Metric configs to process metrics.
+    tpu_create_timeout: Time to provision the machine.
     tpu_name_env_var: The flag to define if set up env variable for tpu name.
     all_workers: The flag to define if run commands on all workers or worker
       0 only.
@@ -71,8 +72,9 @@ def run(self) -> DAGNode:
         group_id=self.task_test_config.benchmark_id, prefix_group_id=True
     ) as group:
       provision, queued_resource, ssh_keys, gcs_location = self.provision()
-      # If you didn't set `MetricConfig.use_runtime_generated_gcs_folder` value in the
-      # test config script then `gcs_location` will take no effect.
+      # If you didn't set `MetricConfig.use_runtime_generated_gcs_folder`
+      # value in the test config script then `gcs_location` will take
+      # no effect.
       if (
           self.task_metric_config
           and self.task_metric_config.use_runtime_generated_gcs_folder
diff --git a/xlml/apis/test_config.py b/xlml/apis/test_config.py
index 8c3f6b67..aec85de0 100644
--- a/xlml/apis/test_config.py
+++ b/xlml/apis/test_config.py
@@ -44,13 +44,14 @@ def __init__(self, accelerator, task_owner=None, test_name):
 """
 
 import abc
-import attrs
-from dags.vm_resource import TpuVersion
 import json
 import os
 import shlex
 from typing import Any, Generic, Iterable, List, Optional, TypeVar
 
+import attrs
+from dags.vm_resource import TpuVersion
+
 
 class Accelerator(abc.ABC):
   """Represents an ML accelerator."""
@@ -124,6 +125,7 @@ class TestConfig(abc.ABC, Generic[A]):
     accelerator: Accelerator type required for this test.
     time_out_in_min: Test timeout in minutes.
     task_owner: Task owner username or link.
+    gcs_subfolder: Subfolder name for default GCS bucket.
   """
 
   accelerator: A
diff --git a/xlml/utils/bigquery.py b/xlml/utils/bigquery.py
index f8d2f83a..cde288c8 100644
--- a/xlml/utils/bigquery.py
+++ b/xlml/utils/bigquery.py
@@ -20,11 +20,11 @@
 import enum
 import math
 from typing import Iterable, Optional
+
 from absl import logging
-from xlml.apis import metric_config
 import google.auth
 from google.cloud import bigquery
-
+from xlml.apis import metric_config
 
 BENCHMARK_BQ_JOB_TABLE_NAME = "job_history"
 BENCHMARK_BQ_METRIC_TABLE_NAME = "metric_history"
@@ -73,6 +73,7 @@ class BigQueryMetricClient:
   Attributes:
     project: The project name for database.
     database: The database name for BigQuery.
+    client: The client for BigQuery Metric.
   """
 
   def __init__(
diff --git a/xlml/utils/bigquery_test.py b/xlml/utils/bigquery_test.py
index 41b143b0..03924453 100644
--- a/xlml/utils/bigquery_test.py
+++ b/xlml/utils/bigquery_test.py
@@ -71,6 +71,7 @@ def test_is_valid_metric(self, x: float, expected_value: bool):
       bigquery.Client, "insert_rows", return_value=["there is an error"]
   )
   def test_insert_failure(self, default, get_table, insert_rows):
+    del default, get_table, insert_rows
     bq_metric = test_bigquery.BigQueryMetricClient()
     self.assertRaises(RuntimeError, bq_metric.insert, self.test_runs)
 
@@ -80,6 +81,7 @@ def test_insert_failure(self, default, get_table, insert_rows):
   @mock.patch.object(bigquery.Client, "get_table", return_value="mock_table")
   @mock.patch.object(bigquery.Client, "insert_rows", return_value=[])
   def test_insert_success(self, default, get_table, insert_rows):
+    del default, get_table, insert_rows
     bq_metric = test_bigquery.BigQueryMetricClient()
     bq_metric.insert(self.test_runs)
 
diff --git a/xlml/utils/gke.py b/xlml/utils/gke.py
index c77d0a91..a35fd0b3 100644
--- a/xlml/utils/gke.py
+++ b/xlml/utils/gke.py
@@ -13,6 +13,8 @@
 
 from xlml.apis import gcp_config
 
+"""Utilities for GKE."""
+
 
 def get_authenticated_client(
     project_name: str, region: str, cluster_name: str
diff --git a/xlml/utils/gpu.py b/xlml/utils/gpu.py
index ff3eb6a5..b6c7abbd 100644
--- a/xlml/utils/gpu.py
+++ b/xlml/utils/gpu.py
@@ -280,7 +280,7 @@ def wait_for_resource_creation(operation_name: airflow.XComArg):
         operation.http_error_message
     )
   elif operation.warnings:
-    logging.warning(f"Warnings during resource creation:\n")
+    logging.warning("Warnings during resource creation:\n")
     for warning in operation.warnings:
       logging.warning(f" - {warning.code}: {warning.message}")
   return True
@@ -388,7 +388,7 @@ def wait_for_resource_deletion(operation_name: airflow.XComArg):
         operation.http_error_message
     )
   elif operation.warnings:
-    logging.warning(f"Warnings during resource deletion:\n")
+    logging.warning("Warnings during resource deletion:\n")
     for warning in operation.warnings:
       logging.warning(f" - {warning.code}: {warning.message}")
   return True
diff --git a/xlml/utils/metric_test.py b/xlml/utils/metric_test.py
index c93f0d94..7ec4f99b 100644
--- a/xlml/utils/metric_test.py
+++ b/xlml/utils/metric_test.py
@@ -29,9 +29,6 @@
 
 from dags.vm_resource import TpuVersion, RuntimeVersion
 
-"""Tests for Benchmark metric.py."""
-
-
 class BenchmarkMetricTest(parameterized.TestCase, absltest.TestCase):
 
   def get_tempdir(self):
@@ -152,7 +149,7 @@ def test_aggregate_metrics(
     self.assertAlmostEqual(actual_value, expected_value)
 
   @mock.patch("xlml.utils.metric.download_object_from_gcs")
-  def test_process_json_lines(self, download_object_from_gcs):
+  def test_process_json_lines(self, _):
     path = "/tmp/ml-auto-solutions-metrics.jsonl"
     test_run1 = {
         "metrics": {"accuracy": 0.95, "MFU": 0.50},
@@ -273,10 +270,10 @@
             "COMPOSER_LOCATION": "test_location",
             "COMPOSER_ENVIRONMENT": "test_env",
         },
-    ) as mock_variable:
+    ) as _:
       with mock.patch.object(
           composer, "get_airflow_url", return_value="http://airflow"
-      ) as mock_object:
+      ) as _:
         raw_meta = [
             [
                 bigquery.MetadataHistoryRow(
@@ -313,7 +310,11 @@
             bigquery.MetadataHistoryRow(
                 job_uuid=uuid,
                 metadata_key="airflow_dag_run_link",
-                metadata_value="http://airflow/dags/benchmark_test/grid?dag_run_id=manual__2023-08-07T21%3A03%3A49.181263%2B00%3A00&task_id=post_process",
+                metadata_value=(
+                    "http://airflow/dags/benchmark_test/grid?"
+                    "dag_run_id=manual__2023-08-07T21%3A03%3A49."
+                    "181263%2B00%3A00&task_id=post_process"
+                ),
             )
         )
 
diff --git a/xlml/utils/tpu.py b/xlml/utils/tpu.py
index d9ed7db1..927dc8a1 100644
--- a/xlml/utils/tpu.py
+++ b/xlml/utils/tpu.py
@@ -166,7 +166,7 @@ def create_queued_resource_request(
       queued_resource=queued_resource,
   )
   response = qr_operation.result()
-  logging.info('Create QR response: {}'.format(response))
+  logging.info(f'Create QR response: {response}')
   # TODO(wcromar): do anything about failures
 
   return response.name
@@ -248,7 +248,7 @@ def delete_tpu_nodes_request(qualified_name: str):
   for node in qr.tpu.node_spec:
     try:
       op = client.delete_node(name=f'{node.parent}/nodes/{node.node_id}')
-      logging.info('Delete node state: {}'.format(op))
+      logging.info(f'Delete node state: {op}')
     except google.api_core.exceptions.NotFound:
       logging.info(f'{node.node_id} is already deleted')
 
diff --git a/xlml/utils/xpk.py b/xlml/utils/xpk.py
index 94c2f674..4fd03ff6 100644
--- a/xlml/utils/xpk.py
+++ b/xlml/utils/xpk.py
@@ -21,9 +21,7 @@
 from airflow.decorators import task
 from airflow.exceptions import AirflowFailException
 from airflow.hooks.subprocess import SubprocessHook
-from datetime import timedelta
 from kubernetes import client as k8s_client
-from kubernetes.client import models as k8s_models
 from xlml.apis import metric_config
 from xlml.utils import gke
 from dags.vm_resource import GpuVersion
@@ -136,7 +134,7 @@ def wait_for_workload_completion(
     return False
 
   if any(pod.status.phase in ["Pending", "Running"] for pod in pods.items):
-    logging.info(f"At least one pod has yet to complete")
+    logging.info("At least one pod has yet to complete.")
     return False
 
   try: