From bc5c2c98041cd938d16f2ebd327bd58f950c7d0b Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 27 Nov 2023 13:50:14 -0800 Subject: [PATCH 1/5] client: cache specs by projects, fix mem leak --- .../mpcontribs/client/__init__.py | 133 ++++++++++++------ 1 file changed, 89 insertions(+), 44 deletions(-) diff --git a/mpcontribs-client/mpcontribs/client/__init__.py b/mpcontribs-client/mpcontribs/client/__init__.py index 664cbf4b0..178970986 100644 --- a/mpcontribs-client/mpcontribs/client/__init__.py +++ b/mpcontribs-client/mpcontribs/client/__init__.py @@ -39,7 +39,9 @@ from bravado.requests_client import RequestsClient from bravado.swagger_model import Loader from bravado.config import bravado_config_from_config_dict -from bravado_core.spec import Spec +from bravado_core.spec import Spec, build_api_serving_url, _identity +from bravado_core.model import model_discovery +from bravado_core.resource import build_resources from bravado.exception import HTTPNotFound from bravado_core.validate import validate_object from json2html import Json2Html @@ -203,6 +205,16 @@ def validate_url(url_string, qualifying=("scheme", "netloc")): url_format = SwaggerFormat( format="url", to_wire=str, to_python=str, validate=validate_url, description="URL", ) +bravado_config_dict = { + "validate_responses": False, + "use_models": False, + "include_missing_properties": False, + "formats": [email_format, url_format], +} +bravado_config = bravado_config_from_config_dict(bravado_config_dict) +for key in set(bravado_config._fields).intersection(set(bravado_config_dict)): + del bravado_config_dict[key] +bravado_config_dict["bravado"] = bravado_config # https://stackoverflow.com/a/8991553 @@ -641,9 +653,37 @@ def _run_futures(futures, total: int = 0, timeout: int = -1, desc=None, disable= @functools.lru_cache(maxsize=1000) def _load(protocol, host, headers_json, project, version): + spec_dict = _raw_specs(protocol, host, version) + if not spec_dict["paths"]: + url = f"{protocol}://{host}" + origin_url = f"{url}/apispec.json" + http_client = RequestsClient() + swagger_spec = Spec.from_dict(spec_dict, origin_url, http_client, bravado_config_dict) + http_client.session.close() + return swagger_spec + + # retrieve list of projects accessible to user headers = ujson.loads(headers_json) + query = {"name": project} if project else {} + query["_fields"] = ["name"] + url = f"{protocol}://{host}" + resp = requests.get(f"{url}/projects/", params=query, headers=headers).json() + + if not resp or not resp["data"]: + raise MPContribsClientError(f"Failed to load projects for query {query}!") + + if project and not resp["data"]: + raise MPContribsClientError(f"{project} doesn't exist, or access denied!") + + projects = sorted(d["name"] for d in resp["data"]) + projects_json = ujson.dumps(projects) + # expand regex-based query parameters for `data` columns + return _expand_params(protocol, host, version, projects_json) + + +@functools.lru_cache(maxsize=1) +def _raw_specs(protocol, host, version): http_client = RequestsClient() - http_client.session.headers.update(headers) url = f"{protocol}://{host}" origin_url = f"{url}/apispec.json" url4fn = origin_url.replace("apispec", f"apispec-{version}").encode('utf-8') @@ -668,35 +708,19 @@ def _load(protocol, host, headers_json, project, version): spec_dict["host"] = host spec_dict["schemes"] = [protocol] - - config = { - "validate_responses": False, - "use_models": False, - "include_missing_properties": False, - "formats": [email_format, url_format], - } - bravado_config = bravado_config_from_config_dict(config) - for key in set(bravado_config._fields).intersection(set(config)): - del config[key] - config["bravado"] = bravado_config - swagger_spec = Spec.from_dict(spec_dict, origin_url, http_client, config) - - if not spec_dict["paths"]: - return swagger_spec - - # expand regex-based query parameters for `data` columns - query = {"name": project} if project else {} - query["_fields"] = ["columns"] - resp = http_client.session.get(f"{url}/projects/", params=query).json() http_client.session.close() + return spec_dict - if not resp or not resp["data"]: - raise MPContribsClientError(f"Failed to load projects for query {query}!") - - if project and not resp["data"]: - raise MPContribsClientError(f"{project} doesn't exist, or access denied!") +@functools.lru_cache(maxsize=100) +def _expand_params(protocol, host, version, projects_json): columns = {"string": [], "number": []} + projects = ujson.loads(projects_json) + query = {"project__in": projects} + query["_fields"] = ["columns"] + url = f"{protocol}://{host}" + http_client = RequestsClient() + resp = http_client.session.get(f"{url}/projects/", params=query).json() for proj in resp["data"]: for column in proj["columns"]: @@ -708,29 +732,50 @@ def _load(protocol, host, headers_json, project, version): col = f"{col}__value" columns["number"].append(col) - resource = swagger_spec.resources["contributions"] + spec_dict = _raw_specs(protocol, host, version) + resource = spec_dict["paths"]["/contributions/"]["get"] + raw_params = resource.pop("parameters") + params = {} - for operation_id, operation in resource.operations.items(): - for pn in list(operation.params.keys()): - if pn.startswith("data_"): - param = operation.params.pop(pn) - op = param.name.rsplit('$__', 1)[-1] - typ = param.param_spec.get("type") - key = "number" if typ == "number" else "string" + for param in raw_params: + if param["name"].startswith("^data__"): + op = param["name"].rsplit('$__', 1)[-1] + typ = param["type"] + key = "number" if typ == "number" else "string" - for column in columns[key]: - param_name = f"{column}__{op}" + for column in columns[key]: + param_name = f"{column}__{op}" + if param_name not in params: param_spec = { - k: v - for k, v in param.param_spec.items() - if k != "description" + k: v for k, v in param.items() + if k not in ["name", "description"] } param_spec["name"] = param_name - operation.params[param_name] = Param( - swagger_spec, operation, param_spec - ) + params[param_name] = param_spec + else: + params[param["name"]] = param + + resource["parameters"] = list(params.values()) - return swagger_spec + origin_url = f"{url}/apispec.json" + spec = Spec(spec_dict, origin_url, http_client, bravado_config_dict) + model_discovery(spec) + + if spec.config['internally_dereference_refs']: + spec.deref = _identity + spec._internal_spec_dict = spec.deref_flattened_spec + + for user_defined_format in spec.config['formats']: + spec.register_format(user_defined_format) + + spec.resources = build_resources(spec) + spec.api_url = build_api_serving_url( + spec_dict=spec.spec_dict, + origin_url=spec.origin_url, + use_spec_url_for_base_path=spec.config['use_spec_url_for_base_path'], + ) + http_client.session.close() + return spec @functools.lru_cache(maxsize=1) From 0e80835f1beb73ea0f225478190118165e3e8d6f Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 27 Nov 2023 13:54:21 -0800 Subject: [PATCH 2/5] remove unused import --- mpcontribs-client/mpcontribs/client/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mpcontribs-client/mpcontribs/client/__init__.py b/mpcontribs-client/mpcontribs/client/__init__.py index 178970986..44279f975 100644 --- a/mpcontribs-client/mpcontribs/client/__init__.py +++ b/mpcontribs-client/mpcontribs/client/__init__.py @@ -18,7 +18,6 @@ from math import isclose from semantic_version import Version from requests.exceptions import RequestException -from bravado_core.param import Param from bson.objectid import ObjectId from typing import Union, Type, List from tqdm.auto import tqdm From ffaeb224c546d66ba904363726fb8898a35fb06d Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 27 Nov 2023 14:03:29 -0800 Subject: [PATCH 3/5] use_inf_as_na deprecated --- mpcontribs-client/mpcontribs/client/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mpcontribs-client/mpcontribs/client/__init__.py b/mpcontribs-client/mpcontribs/client/__init__.py index 44279f975..1c880fcfe 100644 --- a/mpcontribs-client/mpcontribs/client/__init__.py +++ b/mpcontribs-client/mpcontribs/client/__init__.py @@ -7,6 +7,7 @@ import gzip import warnings import pandas as pd +import numpy as np import plotly.io as pio import itertools import functools @@ -88,7 +89,6 @@ j2h = Json2Html() pd.options.plotting.backend = "plotly" -pd.set_option('mode.use_inf_as_na', True) pio.templates.default = "simple_white" warnings.formatwarning = lambda msg, *args, **kwargs: f"{msg}\n" warnings.filterwarnings("default", category=DeprecationWarning, module=__name__) @@ -383,6 +383,7 @@ def from_dict(cls, dct: dict): def _clean(self): """clean the dataframe""" + self.replace([np.inf, -np.inf], np.nan, inplace=True) self.fillna('', inplace=True) self.index = self.index.astype(str) for col in self.columns: From deb44a7816d420b15887cacda3e39d22ad454490 Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 27 Nov 2023 14:26:44 -0800 Subject: [PATCH 4/5] add content type header --- mpcontribs-client/mpcontribs/client/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mpcontribs-client/mpcontribs/client/__init__.py b/mpcontribs-client/mpcontribs/client/__init__.py index 1c880fcfe..385732487 100644 --- a/mpcontribs-client/mpcontribs/client/__init__.py +++ b/mpcontribs-client/mpcontribs/client/__init__.py @@ -658,6 +658,7 @@ def _load(protocol, host, headers_json, project, version): url = f"{protocol}://{host}" origin_url = f"{url}/apispec.json" http_client = RequestsClient() + http_client.session.headers.update(headers) swagger_spec = Spec.from_dict(spec_dict, origin_url, http_client, bravado_config_dict) http_client.session.close() return swagger_spec @@ -720,6 +721,7 @@ def _expand_params(protocol, host, version, projects_json): query["_fields"] = ["columns"] url = f"{protocol}://{host}" http_client = RequestsClient() + http_client.session.headers["Content-Type"] = "application/json" resp = http_client.session.get(f"{url}/projects/", params=query).json() for proj in resp["data"]: From a7c1f8cc790a2ab26c934b8498173ba086974acb Mon Sep 17 00:00:00 2001 From: Patrick Huck Date: Mon, 27 Nov 2023 14:30:44 -0800 Subject: [PATCH 5/5] fix undefined headers var --- mpcontribs-client/mpcontribs/client/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mpcontribs-client/mpcontribs/client/__init__.py b/mpcontribs-client/mpcontribs/client/__init__.py index 385732487..a71f7929f 100644 --- a/mpcontribs-client/mpcontribs/client/__init__.py +++ b/mpcontribs-client/mpcontribs/client/__init__.py @@ -654,6 +654,8 @@ def _run_futures(futures, total: int = 0, timeout: int = -1, desc=None, disable= @functools.lru_cache(maxsize=1000) def _load(protocol, host, headers_json, project, version): spec_dict = _raw_specs(protocol, host, version) + headers = ujson.loads(headers_json) + if not spec_dict["paths"]: url = f"{protocol}://{host}" origin_url = f"{url}/apispec.json" @@ -664,7 +666,6 @@ def _load(protocol, host, headers_json, project, version): return swagger_spec # retrieve list of projects accessible to user - headers = ujson.loads(headers_json) query = {"name": project} if project else {} query["_fields"] = ["name"] url = f"{protocol}://{host}"