Poetry #40

Merged · 26 commits · Dec 28, 2024
23 changes: 9 additions & 14 deletions .github/workflows/main.yml
@@ -8,26 +8,21 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        pip install -r requirements/install.txt
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Lint with ruff
+      uses: astral-sh/ruff-action@v2
+      with:
+        src: "./aws_log_parser"
     - name: Test with pytest
       run: |
-        pip install -r requirements/test.txt
-        pytest
+        pip install pipx
+        pipx install poetry
+        poetry install
+        poetry run pytest
14 changes: 5 additions & 9 deletions .pre-commit-config.yaml
@@ -10,16 +10,12 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
 
-  - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-
-  - repo: https://github.com/ambv/black
-    rev: 23.1.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.8.4
     hooks:
-      - id: black
-        language_version: python3.10
+      - id: ruff
+      - id: ruff-format
 
   - repo: local
     hooks:
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,25 @@
+3.0.0
+-----
+
+* Use Poetry for packaging and dependency management.
+
+* Added `regex_filter`, which filters files using a regex.
+
+* Fixed an issue when listing multiple files in a directory.
+
+* Gzip is supported for local and remote files.
+
+2.4.1
+-----
+
+* fix: Add missing ALB auth error: AuthMissingAWSALBAuthNonce
+
+* fix: LoadBalancerLogEntry does not always provide the targetGroup ARN
+
+* fix: LoadBalancerLogEntry tests fix
+
+Thanks @pkoltermann!
+
 1.8.3
 -----
12 changes: 10 additions & 2 deletions README.md
@@ -66,6 +66,14 @@ schemes are supported.
 GZipped LoadBalancer logs are supported by passing `file_suffix=".gz"` to
 the AwsLogParser initializer.
 
+You can filter files based on a regex by passing `regex_filter` to the
+AwsLogParser initializer. For example:
+
+    parser = AwsLogParser(
+        log_type=LogType.CloudFront,
+        regex_filter='20241226',
+    )
+
 S3:
 
 ```python
@@ -272,5 +280,5 @@ See https://github.com/dpetzold/aws-log-parser/blob/master/aws_log_parser/models
 
 ## Development
 
-Run `bootstrap.sh` to create the virtualenv. The tests can be run with `python
-setup.py test` or by running `pytest` directly.
+    poetry install
+    poetry run pytest
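As a usage sketch of the new `regex_filter` option documented above — the top-level imports mirror the README's existing examples, the `./logs` directory is a placeholder, and `client_ip` stands in for whatever entry attribute you actually need:

```python
from aws_log_parser import AwsLogParser, LogType

# regex_filter is applied with re.match against each local file name
# (and, per the s3.py change below, against each S3 key).
parser = AwsLogParser(
    log_type=LogType.CloudFront,
    regex_filter="20241226",  # hypothetical date-stamped file names
)

# Yields parsed entries only from files whose names match the regex.
for entry in parser.read_files("./logs"):
    print(entry.client_ip)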
23 changes: 14 additions & 9 deletions aws_log_parser/aws/s3.py
@@ -1,11 +1,13 @@
-import gzip
+import re
 
 from dataclasses import dataclass
+from io import BytesIO
 
 from .client import (
     AwsClient,
     AwsService,
 )
+from ..io import FileIterator
 
 
 @dataclass
@@ -26,19 +28,22 @@ def list_files(self, bucket, prefix, sort_key, reverse=True):
 
         return sorted(items, key=lambda x: x[sort_key], reverse=reverse)
 
-    def read_key(self, bucket, key, endswith=None):
+    def read_key(self, bucket, key):
         if self.aws_client.verbose:
             print(f"Reading s3://{bucket}/{key}")
         contents = self.client.get_object(Bucket=bucket, Key=key)
-        if endswith == ".gz":
-            with gzip.GzipFile(fileobj=contents["Body"]) as _gz:
-                yield from [line for line in _gz.read().decode("utf-8").splitlines()]
-        else:
-            yield from [line.decode("utf-8") for line in contents["Body"].iter_lines()]
+        yield from FileIterator(
+            fileobj=BytesIO(contents["Body"].read()),
+            gzipped=key.endswith(".gz"),
+        )
 
-    def read_keys(self, bucket, prefix, endswith=None):
+    def read_keys(self, bucket, prefix, endswith=None, regex_filter=None):
+        reo = re.compile(regex_filter) if regex_filter else None
         for file in self.list_files(bucket, prefix, "LastModified"):
             if endswith and not file["Key"].endswith(endswith):
                 continue
 
-            yield from self.read_key(bucket, file["Key"], endswith)
+            if reo and not reo.match(file["Key"]):
+                continue
+
+            yield from self.read_key(bucket, file["Key"])
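For readers skimming the diff, here is a standalone sketch of what `read_keys` now does. It bypasses the project's `AwsClient` wrapper and talks to boto3 directly, so the paginator usage is an assumption of this sketch, not code from the PR:

```python
import re
from io import BytesIO

import boto3  # assumed dependency for this sketch; the PR routes through AwsClient

from aws_log_parser.io import FileIterator


def read_keys_sketch(bucket, prefix, endswith=".log", regex_filter=None):
    """List keys under a prefix, skip any that fail the suffix or regex
    checks, then stream each object's lines through FileIterator,
    gunzipping when the key ends in .gz."""
    s3 = boto3.client("s3")
    reo = re.compile(regex_filter) if regex_filter else None
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if endswith and not key.endswith(endswith):
                continue
            if reo and not reo.match(key):
                continue
            body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
            yield from FileIterator(fileobj=BytesIO(body), gzipped=key.endswith(".gz"))
```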
32 changes: 23 additions & 9 deletions aws_log_parser/interface.py
@@ -2,13 +2,15 @@
 import typing
 import importlib
 import importlib.util
+import re
 import sys
 
 from dataclasses import dataclass, fields, field
 from pathlib import Path
 from urllib.parse import urlparse
 
 from .aws import AwsClient
+from .io import FileIterator
 from .models import (
     LogFormat,
     LogFormatType,
@@ -26,6 +28,7 @@ class AwsLogParser:
     region: typing.Optional[str] = None
     profile: typing.Optional[str] = None
     file_suffix: str = ".log"
+    regex_filter: typing.Optional[str] = None
     verbose: bool = False
 
     plugin_paths: typing.List[typing.Union[str, Path]] = field(default_factory=list)
@@ -98,10 +101,11 @@ def read_file(self, path):
         :return: Parsed log entries.
         :rtype: Dependent on log_type.
         """
+        if not isinstance(path, Path):
+            path = Path(path)
         if self.verbose:
             print(f"Reading file://{path}")
-        with open(path) as log_data:
-            yield from self.parse(log_data.readlines())
+        yield from self.parse(FileIterator(path, gzipped=path.suffix == ".gz"))
 
     def read_files(self, pathname):
         """
@@ -113,12 +117,18 @@ def read_files(self, pathname):
         :return: Parsed log entries.
         :rtype: Dependent on log_type.
         """
-        path = Path(pathname)
-        if path.is_file():
-            yield from self.read_file(path)
+        base_path = Path(pathname)
+        if base_path.is_dir():
+            if self.regex_filter:
+                reo = re.compile(self.regex_filter)
+                for path in base_path.iterdir():
+                    if reo.match(path.name) and path.is_file():
+                        yield from self.read_file(path)
+            else:
+                for path in base_path.glob(f"**/*{self.file_suffix}"):
+                    yield from self.read_file(path)
         else:
-            for p in path.glob(f"**/*{self.file_suffix}"):
-                yield from self.read_file(p)
+            yield from self.read_file(base_path)
 
     def read_s3(self, bucket, prefix, endswith=None):
         """
@@ -133,7 +143,12 @@ def read_s3(self, bucket, prefix, endswith=None):
         :rtype: Dependent on log_type.
         """
         yield from self.parse(
-            self.aws_client.s3_service.read_keys(bucket, prefix, endswith=endswith)
+            self.aws_client.s3_service.read_keys(
+                bucket,
+                prefix,
+                endswith=endswith if endswith else self.file_suffix,
+                regex_filter=self.regex_filter,
+            )
        )
 
     def read_url(self, url):
@@ -165,7 +180,6 @@ def read_url(self, url):
             yield from self.read_s3(
                 parsed.netloc,
                 parsed.path.lstrip("/"),
-                endswith=self.file_suffix,
             )
         else:
             raise ValueError(f"Unknown scheme {parsed.scheme}")
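Taken together, the `read_files` and `read_s3` changes mean one parser configuration drives both local and S3 reads. A hedged example — the bucket, prefix, and directory are placeholders:

```python
from aws_log_parser import AwsLogParser, LogType

parser = AwsLogParser(
    log_type=LogType.LoadBalancer,
    file_suffix=".gz",             # now also the S3 suffix filter when no endswith is given
    regex_filter=r".*20241226",    # applied with re.match, hence the leading .*
)

# Local directory: iterdir + regex when regex_filter is set,
# otherwise a recursive glob on file_suffix.
local_entries = parser.read_files("/var/log/alb")

# S3: endswith falls back to file_suffix, regex_filter is forwarded to read_keys.
s3_entries = parser.read_s3("my-bucket", "AWSLogs/123456789012/")
```

Note that both filters use `re.match`, which anchors at the start of the file name or S3 key, so patterns that should hit a date in the middle of a name need a `.*` prefix as above.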
46 changes: 46 additions & 0 deletions aws_log_parser/io.py
@@ -0,0 +1,46 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+import io
+import gzip
+import typing
+
+
+@dataclass
+class FileIterator:
+    path: typing.Optional[Path] = None
+    fileobj: typing.Optional[io.IOBase] = None
+    gzipped: bool = False
+
+    def yield_gzipped(self, fh):
+        yield from [line for line in fh.read().decode("utf-8").splitlines()]
+
+    def yield_plain(self, fh):
+        yield from [line.decode("utf-8") for line in fh]
+
+    @contextmanager
+    def open_path(self):
+        assert self.path
+        fh = self.path.open("rb")
+        try:
+            yield fh
+        finally:
+            fh.close()
+
+    @contextmanager
+    def open_gzip(self):
+        if self.fileobj:
+            yield gzip.GzipFile(fileobj=self.fileobj)
+        else:
+            with self.open_path() as fh:
+                yield gzip.GzipFile(fileobj=fh)
+
+    def __iter__(self):
+        yield_func = self.yield_gzipped if self.gzipped else self.yield_plain
+        open_func = self.open_gzip if self.gzipped else self.open_path
+
+        if not self.gzipped and self.fileobj:
+            yield from yield_func(self.fileobj)
+        else:
+            with open_func() as fh:
+                yield from yield_func(fh)
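A quick sketch of how the new `FileIterator` is meant to be driven (the file names here are hypothetical):

```python
from io import BytesIO
from pathlib import Path

from aws_log_parser.io import FileIterator

# Plain file on disk: opened "rb", each line decoded as UTF-8.
plain_lines = list(FileIterator(path=Path("access.log")))

# Gzipped file on disk: wrapped in gzip.GzipFile before decoding.
gz_lines = list(FileIterator(path=Path("access.log.gz"), gzipped=True))

# In-memory bytes, e.g. an S3 body already read into memory.
mem_lines = list(FileIterator(fileobj=BytesIO(b"one\ntwo\n")))
```

One quirk of the implementation as written: the gzipped path strips line endings via `splitlines()`, while the plain path decodes raw lines and keeps their trailing `\n`, so callers should not rely on consistent line endings between the two.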
11 changes: 7 additions & 4 deletions aws_log_parser/models.py
@@ -258,7 +258,10 @@ class WafLogEntry(LogEntry):
     timestamp: datetime.datetime = field(
         metadata=config(
             encoder=lambda t: datetime.datetime.timestamp(t) * 1000,
-            decoder=lambda t: datetime.datetime.utcfromtimestamp(t / 1000),
+            decoder=lambda t: datetime.datetime.fromtimestamp(
+                t / 1000,
+                datetime.timezone.utc,
+            ),
         )
     )
     formatVersion: int
@@ -271,9 +274,9 @@ class WafLogEntry(LogEntry):
     httpRequest: WafLogEntryHttpRequest
     ruleGroupList: typing.List[WafLogEntryRuleGroup] = field(default_factory=list)
     rateBasedRuleList: typing.List[WafLogEntryRateGroup] = field(default_factory=list)
-    nonTerminatingMatchingRules: typing.List[
-        WafLogEntryNonTerminatingMatchingRule
-    ] = field(default_factory=list)
+    nonTerminatingMatchingRules: typing.List[WafLogEntryNonTerminatingMatchingRule] = (
+        field(default_factory=list)
+    )
 
     @property
     def client_ip(self):
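The decoder change above replaces `datetime.utcfromtimestamp`, deprecated as of Python 3.12 (now in the CI matrix), with a timezone-aware `fromtimestamp`. A minimal round-trip check of the encoder/decoder pair, written as plain functions for readability:

```python
import datetime


def encode(t: datetime.datetime) -> float:
    # Mirrors the field's encoder: aware datetime -> epoch milliseconds.
    return datetime.datetime.timestamp(t) * 1000


def decode(ms: float) -> datetime.datetime:
    # Mirrors the field's decoder: epoch milliseconds -> UTC-aware datetime.
    return datetime.datetime.fromtimestamp(ms / 1000, datetime.timezone.utc)


ms = 1735344000000  # 2024-12-28T00:00:00Z in epoch milliseconds
dt = decode(ms)
assert dt == datetime.datetime(2024, 12, 28, tzinfo=datetime.timezone.utc)
assert encode(dt) == ms
```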
25 changes: 0 additions & 25 deletions bootstrap.sh

This file was deleted.

52 changes: 0 additions & 52 deletions examples/count-hosts-gzip-alb.py

This file was deleted.
