Merge pull request #34 from westandskif/optimize/agg

Optimize/agg
westandskif · Aug 22, 2024 · 21b2821 · 21b2821
2 parents 710dd25 + cb64b67
commit 21b2821
Show file tree

Hide file tree

Showing 108 changed files with 2,378 additions and 982 deletions.
diff --git a/.pylintrc b/.pylintrc
@@ -229,7 +229,7 @@ class-rgx=^_?[A-Z][a-zA-Z0-9]*_?$
 module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
 
 # Regular expression matching correct method names
-method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
+method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|visit_.+|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
 
 # Regular expression which should only match function or class names that do
 # not require a docstring.

diff --git a/Makefile b/Makefile
@@ -47,3 +47,10 @@ test-py%:
 		"pip install -e . && pytest"
 
 test: test-py3.6 test-py3.7 test-py3.8 test-py3.9 test-py3.10 test-py3.11 test-py3.12
+
+benchmark-py%:
+	~/.pyenv/versions/convtools-$*/bin/pip install -e .
+	~/.pyenv/versions/convtools-$*/bin/pip install tabulate
+	~/.pyenv/versions/convtools-$*/bin/python run_benchmarks.py
+
+benchmarks: benchmark-py3.7 benchmark-py3.8 benchmark-py3.9 benchmark-py3.10 benchmark-py3.11 benchmark-py3.12 benchmark-py3.13
diff --git a/benchmarks/benchmark_results_V1.csv b/benchmarks/benchmark_results_V1.csv
diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
@@ -19,7 +19,7 @@ def gen_converter(self):
             {
                 "a": c.ReduceFuncs.Sum(c.item("value_1")),
                 "b": c.ReduceFuncs.Min(c.item("value_2")),
-                "c": c.ReduceFuncs.Max(c.item("value_3")),
+                "c": c.ReduceFuncs.Max(c.item("value_2")),
             }
         ).gen_converter()
 
@@ -35,10 +35,10 @@ def f(data):
                 ):
                     b = i["value_2"]
 
-                if i["value_3"] is not None and (
-                    c is None or c < i["value_3"]
+                if i["value_2"] is not None and (
+                    c is None or c < i["value_2"]
                 ):
-                    c = i["value_3"]
+                    c = i["value_2"]
             return {
                 "a": a,
                 "b": b,
@@ -49,8 +49,7 @@ def f(data):
 
     def gen_data(self):
         return [
-            {"value_1": random(), "value_2": random(), "value_3": random()}
-            for i in range(10000)
+            {"value_1": random(), "value_2": random()} for i in range(10000)
         ]
 
 

diff --git a/benchmarks/build_heuristics.py b/benchmarks/build_heuristics.py
@@ -15,7 +15,12 @@ def print_new_weights():  # pragma: no cover
         "LIST_INIT": SimpleTimer("[1,2,3,4,5]"),
         "SET_INIT": SimpleTimer("{1,2,3,4,5}"),
         "DICT_INIT": SimpleTimer("{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}"),
+        "FUNCTION_CALL": SimpleTimer("f(v, 1)", "v=1\ndef f(v, x): return x"),
     }
+    print("# WARMING UP")
+    for i in range(2):
+        SimpleTimer.get_base_time()
+
     print("# CALCULATING BASE TIME")
     base_time = SimpleTimer.get_base_time() / 100.0
 
@@ -34,7 +39,6 @@ def print_new_weights():  # pragma: no cover
         print(f"    {name} = {it}")
         max_it = max(max_it, it)
 
-    print("    FUNCTION_CALL = DICT_LOOKUP * 4")
     print(f"    UNPREDICTABLE = {max_it * 100}")
 
 

diff --git a/benchmarks/main_versions.txt b/benchmarks/main_versions.txt
@@ -1,3 +1,4 @@
 0.19.0
 0.38.0
 1.1.2
+1.13.0
diff --git a/benchmarks/storage.py b/benchmarks/storage.py
@@ -7,16 +7,16 @@
 from itertools import chain
 from time import time
 
-from pydantic import BaseModel
 
 import convtools
 from convtools import conversion as c
 from convtools.contrib.tables import Table
 
 from .timer import SimpleTimer
+from typing import NamedTuple
 
 
-class Environment(BaseModel):
+class Environment(NamedTuple):
     system: str
     arch: str
     py_version: str
@@ -26,7 +26,14 @@ class Environment(BaseModel):
     convtools_version: str
 
 
-class BenchmarkResult(Environment):
+class BenchmarkResult(NamedTuple):
+    system: str
+    arch: str
+    py_version: str
+    py_version_exact: str
+    py_implementation: str
+    py_compiler: str
+    convtools_version: str
     name: str
     diff: float
 
@@ -58,9 +65,10 @@ def _measure(self, f, data):
     def compare_results(self, data1, data2) -> bool:
         return data1 == data2
 
-    def get_execution_result(self) -> BenchmarkResult:
+    def get_execution_result(self, silent=False) -> BenchmarkResult:
         name = self.get_name()
-        print(f"TESTING: {name}")
+        if not silent:
+            print(f"TESTING: {name}")
         data = self.gen_data()
 
         convtools_converter = self.gen_converter()
@@ -79,7 +87,7 @@ def get_execution_result(self) -> BenchmarkResult:
         return BenchmarkResult(
             name=name,
             diff=best_naive / convtools_time,
-            **ENVIRONMENT.model_dump(),
+            **ENVIRONMENT._asdict(),
         )
 
     @abc.abstractmethod
@@ -107,10 +115,10 @@ def load_results(self):
         if os.path.exists(self.FILENAME):
             return list(
                 map(
-                    BenchmarkResult.model_validate,
-                    Table.from_csv(self.FILENAME, header=True).into_iter_rows(
-                        dict
-                    ),
+                    BenchmarkResult._make,
+                    Table.from_csv(self.FILENAME, header=True)
+                    .update(diff=c.col("diff").as_type(float))
+                    .into_iter_rows(tuple),
                 )
             )
         return []
@@ -133,7 +141,7 @@ def save(self):
         merged_results = (
             c.group_by(*(c.attr(key_) for key_ in self._key_fields))
             .aggregate(c.ReduceFuncs.Last(c.this))
-            .iter(c.this.call_method("model_dump"))
+            .iter(c.this.call_method("_asdict"))
             .execute(chain(self.load_results(), self.new_results))
         )
         resulting_rows = sorted(

diff --git a/benchmarks/timer.py b/benchmarks/timer.py
@@ -8,11 +8,11 @@ class SimpleTimer(Timer):
     """A simpler timer which allows to automatically measure time taken to get
     stable results based on coefficient of variation (sigma to mean ratio)"""
 
-    REL_PRECISION = 0.005
+    REL_PRECISION = 0.015
 
     def auto_measure(
         self,
-        max_time=10,
+        max_time=20,
         min_time=0.02,
         rel_precision=REL_PRECISION,
         expected_num_of_checks=40,
@@ -57,8 +57,8 @@ def auto_measure(
 
         while True:
             mean = sum(times) / checks
-            std_deviation = (
-                sqrt(sum(d * d for d in (mean - t for t in times))) / checks
+            std_deviation = sqrt(
+                sum(d * d for d in (mean - t for t in times)) / checks
             )
             ratio = std_deviation / mean
             if ratio < std_deviation_to_mean_ratio:

diff --git a/build-docs-performance.py b/build-docs-performance.py
@@ -1,10 +1,11 @@
 import os
-from glob import glob
 import subprocess
+from glob import glob
 from hashlib import sha256
 
-from convtools.contrib.tables import Table
 from convtools import conversion as c
+from convtools.contrib.tables import Table
+
 
 DOCS_ROOT = "./docs"
 MD_DIR = os.path.join(DOCS_ROOT, "performance-md")
@@ -21,17 +22,25 @@ def ensure_dir(file_path):
 
 
 from typing import List
-from benchmarks.storage import BenchmarkResultsStorage, BenchmarkResult
+
+from benchmarks.storage import BenchmarkResult, BenchmarkResultsStorage
 from tabulate import tabulate
 
+import convtools
+
 
 def gen_md(results: List[BenchmarkResult], py_version: str, indent="    "):
     filtered_results = (
-        c.filter(c.attr("py_version") == py_version)
+        c.filter(
+            c.and_(
+                c.attr("py_version") == py_version,
+                c.attr("convtools_version") == convtools.__version__,
+            )
+        )
         .pipe(
             c.group_by(c.attr("name"))
             .aggregate(
-                c.ReduceFuncs.MinRow(c.attr("diff")).call_method("model_dump")
+                c.ReduceFuncs.MinRow(c.attr("diff")).call_method("_asdict")
             )
             .sort(key=lambda x: x["diff"])
         )
@@ -57,9 +66,11 @@ def gen_md(results: List[BenchmarkResult], py_version: str, indent="    "):
 
 if __name__ == "__main__":
     benchmark_results = BenchmarkResultsStorage().load_results()
+    gen_md(benchmark_results, "3.6")
     gen_md(benchmark_results, "3.7")
     gen_md(benchmark_results, "3.8")
     gen_md(benchmark_results, "3.9")
     gen_md(benchmark_results, "3.10")
     gen_md(benchmark_results, "3.11")
     gen_md(benchmark_results, "3.12")
+    gen_md(benchmark_results, "3.13")
diff --git a/ci-requirements/requirements3.10.in b/ci-requirements/requirements3.10.in
@@ -18,5 +18,4 @@ mkdocs-material
 pygments
 pymdown-extensions
 
-pydantic
 tabulate
diff --git a/ci-requirements/requirements3.10.out b/ci-requirements/requirements3.10.out
@@ -1,21 +1,20 @@
-annotated-types==0.7.0
-astroid==3.2.3
-Babel==2.15.0
-black==24.4.2
+astroid==3.2.4
+babel==2.16.0
+black==24.8.0
 build==1.2.1
 certifi==2024.7.4
 charset-normalizer==3.3.2
 click==8.1.7
 colorama==0.4.6
-coverage==7.6.0
+coverage==7.6.1
 dill==0.3.8
 exceptiongroup==1.2.2
 ghp-import==2.1.0
 idna==3.7
 iniconfig==2.0.0
 isort==5.13.2
 Jinja2==3.1.4
-Markdown==3.6
+Markdown==3.7
 markdown-include==0.8.1
 MarkupSafe==2.1.5
 mccabe==0.7.0
@@ -24,35 +23,33 @@ mergedeep==1.3.4
 mkdocs==1.6.0
 mkdocs-exclude==1.0.2
 mkdocs-get-deps==0.2.0
-mkdocs-material==9.5.29
+mkdocs-material==9.5.32
 mkdocs-material-extensions==1.3.1
-mypy==1.10.1
+mypy==1.11.1
 mypy-extensions==1.0.0
 packaging==24.1
 paginate==0.5.6
 pathspec==0.12.1
 platformdirs==4.2.2
 pluggy==1.5.0
 py-cpuinfo==9.0.0
-pydantic==2.8.2
-pydantic_core==2.20.1
 Pygments==2.18.0
-pylint==3.2.5
-pymdown-extensions==10.8.1
+pylint==3.2.6
+pymdown-extensions==10.9
 pyproject_hooks==1.1.0
-pytest==8.2.2
+pytest==8.3.2
 pytest-benchmark==4.0.0
 pytest-cov==5.0.0
 python-dateutil==2.9.0.post0
-PyYAML==6.0.1
+PyYAML==6.0.2
 pyyaml_env_tag==0.1
-regex==2024.5.15
+regex==2024.7.24
 requests==2.32.3
-ruff==0.5.2
+ruff==0.6.1
 six==1.16.0
 tabulate==0.9.0
 tomli==2.0.1
-tomlkit==0.13.0
+tomlkit==0.13.2
 typing_extensions==4.12.2
 urllib3==2.2.2
-watchdog==4.0.1
+watchdog==4.0.2
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 1.13.0 (2024-08-22)
+
+- sped up aggregations via common subexpression elimination (CSE)
+
 ## 1.12.2 (2024-08-22)
 
 - fixed `c.this.datetime_parse` with default when uncovered data remains

diff --git a/docs/benefits.md b/docs/benefits.md
@@ -47,30 +47,38 @@ other parts of your code. The below table provides the speed-up of
 convtools-based solutions over the naive ones. 
 
 
-/// tab | py3.12
+/// tab | 3.13
+{! performance-md/perf-3.13.md !}
+///
+
+/// tab | 3.12
 {! performance-md/perf-3.12.md !}
 ///
 
-/// tab | py3.11
+/// tab | 3.11
 {! performance-md/perf-3.11.md !}
 ///
 
-/// tab | py3.10
+/// tab | 3.10
 {! performance-md/perf-3.10.md !}
 ///
 
-/// tab | py3.9
+/// tab | 3.9
 {! performance-md/perf-3.9.md !}
 ///
 
-/// tab | py3.8
+/// tab | 3.8
 {! performance-md/perf-3.8.md !}
 ///
 
-/// tab | py3.7
+/// tab | 3.7
 {! performance-md/perf-3.7.md !}
 ///
 
+/// tab | 3.6
+{! performance-md/perf-3.6.md !}
+///
+
 
 In cases where there are multiple speed test results, the worst is
 taken. See [benchmarks on GitHub](https://github.com/westandskif/convtools/blob/master/run_benchmarks.py) for the source code.
diff --git a/docs/examples-md/api__and_then.md b/docs/examples-md/api__and_then.md
@@ -25,16 +25,16 @@ assert converter(range(3)) == [10, 1, 12]
 
 /// tab | debug stdout
 ```python
-def converter(data_):
+def _converter(data_):
     try:
-        return [(i and int(i)) for i in data_]
+        return [(_i and int(_i)) for _i in data_]
     except __exceptions_to_dump_sources:
         __convtools__code_storage.dump_sources()
         raise
 
-def converter(data_):
+def _converter(data_):
     try:
-        return [(((i + 10) if (i != 1) else i)) for i in data_]
+        return [(((_i + 10) if (_i != 1) else _i)) for _i in data_]
     except __exceptions_to_dump_sources:
         __convtools__code_storage.dump_sources()
         raise