Poetry #40

Merged · 26 commits · Dec 28, 2024
23 changes: 9 additions & 14 deletions .github/workflows/main.yml
@@ -8,26 +8,21 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install flake8 pytest
-        pip install -r requirements/install.txt
-    - name: Lint with flake8
-      run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+    - name: Lint with ruff
+      uses: astral-sh/ruff-action@v2
+      with:
+        src: "./aws_log_parser"
     - name: Test with pytest
       run: |
-        pip install -r requirements/test.txt
-        pytest
+        pip install pipx
+        pipx install poetry
+        poetry install
+        poetry run pytest
14 changes: 5 additions & 9 deletions .pre-commit-config.yaml
@@ -10,16 +10,12 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
 
-  - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-
-  - repo: https://github.com/ambv/black
-    rev: 23.1.0
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.8.4
     hooks:
-      - id: black
-        language_version: python3.10
+      - id: ruff
+      - id: ruff-format
 
   - repo: local
     hooks:
22 changes: 22 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,25 @@
+3.0.0
+-----
+
+* Use Poetry for packaging and dependency management.
+
+* Added `regex_filter`, which filters files using a regex.
+
+* Fixed an issue when listing multiple files in a directory.
+
+* Gzip is supported for local and remote files.
+
+2.4.1
+-----
+
+* fix: Add missing ALB auth error: AuthMissingAWSALBAuthNonce
+
+* fix: LoadBalancerLogEntry does not always provide the targetGroup ARN
+
+* fix: LoadBalancerLogEntry tests fix
+
+Thanks @pkoltermann!
+
 1.8.3
 -----
12 changes: 10 additions & 2 deletions README.md
@@ -66,6 +66,14 @@ schemes are supported.
 GZipped LoadBalancer logs are supported by passing `file_suffix=".gz"` to
 the AwsLogParser initializer.
 
+You can filter files based on a regex by passing `regex_filter` to the
+AwsLogParser initializer. For example:
+
+    parser = AwsLogParser(
+        log_type=LogType.CloudFront,
+        regex_filter='20241226',
+    )
+
 S3:
 
 ```python
@@ -272,5 +280,5 @@ See https://github.com/dpetzold/aws-log-parser/blob/master/aws_log_parser/models
 
 ## Development
 
-Run `bootstrap.sh` to create the virtualenv. The tests can be run with `python
-setup.py test` or by running `pytest` directly.
+    poetry install
+    poetry run pytest
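As a usage sketch of the new `regex_filter` option documented above — the top-level imports mirror the README's existing examples, the `./logs` directory is a placeholder, and `client_ip` stands in for whatever entry attribute you actually need:

```python
from aws_log_parser import AwsLogParser, LogType

# regex_filter is applied with re.match against each local file name
# (and, per the s3.py change below, against each S3 key).
parser = AwsLogParser(
    log_type=LogType.CloudFront,
    regex_filter="20241226",  # hypothetical date-stamped file names
)

# Yields parsed entries only from files whose names match the regex.
for entry in parser.read_files("./logs"):
    print(entry.client_ip)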
23 changes: 14 additions & 9 deletions aws_log_parser/aws/s3.py
@@ -1,11 +1,13 @@
-import gzip
+import re
 
 from dataclasses import dataclass
+from io import BytesIO
 
 from .client import (
     AwsClient,
     AwsService,
 )
+from ..io import FileIterator
 
 
 @dataclass
@@ -26,19 +28,22 @@ def list_files(self, bucket, prefix, sort_key, reverse=True):
 
         return sorted(items, key=lambda x: x[sort_key], reverse=reverse)
 
-    def read_key(self, bucket, key, endswith=None):
+    def read_key(self, bucket, key):
         if self.aws_client.verbose:
             print(f"Reading s3://{bucket}/{key}")
         contents = self.client.get_object(Bucket=bucket, Key=key)
-        if endswith == ".gz":
-            with gzip.GzipFile(fileobj=contents["Body"]) as _gz:
-                yield from [line for line in _gz.read().decode("utf-8").splitlines()]
-        else:
-            yield from [line.decode("utf-8") for line in contents["Body"].iter_lines()]
+        yield from FileIterator(
+            fileobj=BytesIO(contents["Body"].read()),
+            gzipped=key.endswith(".gz"),
+        )
 
-    def read_keys(self, bucket, prefix, endswith=None):
+    def read_keys(self, bucket, prefix, endswith=None, regex_filter=None):
+        reo = re.compile(regex_filter) if regex_filter else None
         for file in self.list_files(bucket, prefix, "LastModified"):
             if endswith and not file["Key"].endswith(endswith):
                 continue
 
-            yield from self.read_key(bucket, file["Key"], endswith)
+            if reo and not reo.match(file["Key"]):
+                continue
+
+            yield from self.read_key(bucket, file["Key"])
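For readers skimming the diff, here is a standalone sketch of what `read_keys` now does. It bypasses the project's `AwsClient` wrapper and talks to boto3 directly, so the paginator usage is an assumption of this sketch, not code from the PR:

```python
import re
from io import BytesIO

import boto3  # assumed dependency for this sketch; the PR routes through AwsClient

from aws_log_parser.io import FileIterator


def read_keys_sketch(bucket, prefix, endswith=".log", regex_filter=None):
    """List keys under a prefix, skip any that fail the suffix or regex
    checks, then stream each object's lines through FileIterator,
    gunzipping when the key ends in .gz."""
    s3 = boto3.client("s3")
    reo = re.compile(regex_filter) if regex_filter else None
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if endswith and not key.endswith(endswith):
                continue
            if reo and not reo.match(key):
                continue
            body = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
            yield from FileIterator(fileobj=BytesIO(body), gzipped=key.endswith(".gz"))
```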
32 changes: 23 additions & 9 deletions aws_log_parser/interface.py
@@ -2,13 +2,15 @@
 import typing
 import importlib
 import importlib.util
+import re
 import sys
 
 from dataclasses import dataclass, fields, field
 from pathlib import Path
 from urllib.parse import urlparse
 
 from .aws import AwsClient
+from .io import FileIterator
 from .models import (
     LogFormat,
     LogFormatType,
@@ -26,6 +28,7 @@ class AwsLogParser:
     region: typing.Optional[str] = None
     profile: typing.Optional[str] = None
     file_suffix: str = ".log"
+    regex_filter: typing.Optional[str] = None
     verbose: bool = False
 
     plugin_paths: typing.List[typing.Union[str, Path]] = field(default_factory=list)
@@ -98,10 +101,11 @@ def read_file(self, path):
         :return: Parsed log entries.
         :rtype: Dependent on log_type.
         """
+        if not isinstance(path, Path):
+            path = Path(path)
         if self.verbose:
             print(f"Reading file://{path}")
-        with open(path) as log_data:
-            yield from self.parse(log_data.readlines())
+        yield from self.parse(FileIterator(path, gzipped=path.suffix == ".gz"))
 
     def read_files(self, pathname):
         """
@@ -113,12 +117,18 @@ def read_files(self, pathname):
         :return: Parsed log entries.
         :rtype: Dependent on log_type.
         """
-        path = Path(pathname)
-        if path.is_file():
-            yield from self.read_file(path)
+        base_path = Path(pathname)
+        if base_path.is_dir():
+            if self.regex_filter:
+                reo = re.compile(self.regex_filter)
+                for path in base_path.iterdir():
+                    if reo.match(path.name) and path.is_file():
+                        yield from self.read_file(path)
+            else:
+                for path in base_path.glob(f"**/*{self.file_suffix}"):
+                    yield from self.read_file(path)
         else:
-            for p in path.glob(f"**/*{self.file_suffix}"):
-                yield from self.read_file(p)
+            yield from self.read_file(base_path)
 
     def read_s3(self, bucket, prefix, endswith=None):
         """
@@ -133,7 +143,12 @@ def read_s3(self, bucket, prefix, endswith=None):
         :rtype: Dependent on log_type.
         """
         yield from self.parse(
-            self.aws_client.s3_service.read_keys(bucket, prefix, endswith=endswith)
+            self.aws_client.s3_service.read_keys(
+                bucket,
+                prefix,
+                endswith=endswith if endswith else self.file_suffix,
+                regex_filter=self.regex_filter,
+            )
        )
 
     def read_url(self, url):
@@ -165,7 +180,6 @@ def read_url(self, url):
             yield from self.read_s3(
                 parsed.netloc,
                 parsed.path.lstrip("/"),
-                endswith=self.file_suffix,
             )
         else:
             raise ValueError(f"Unknown scheme {parsed.scheme}")
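Taken together, the `read_files` and `read_s3` changes mean one parser configuration drives both local and S3 reads. A hedged example — the bucket, prefix, and directory are placeholders:

```python
from aws_log_parser import AwsLogParser, LogType

parser = AwsLogParser(
    log_type=LogType.LoadBalancer,
    file_suffix=".gz",             # now also the S3 suffix filter when no endswith is given
    regex_filter=r".*20241226",    # applied with re.match, hence the leading .*
)

# Local directory: iterdir + regex when regex_filter is set,
# otherwise a recursive glob on file_suffix.
local_entries = parser.read_files("/var/log/alb")

# S3: endswith falls back to file_suffix, regex_filter is forwarded to read_keys.
s3_entries = parser.read_s3("my-bucket", "AWSLogs/123456789012/")
```

Note that both filters use `re.match`, which anchors at the start of the file name or S3 key, so patterns that should hit a date in the middle of a name need a `.*` prefix as above.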
46 changes: 46 additions & 0 deletions aws_log_parser/io.py
@@ -0,0 +1,46 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+import io
+import gzip
+import typing
+
+
+@dataclass
+class FileIterator:
+    path: typing.Optional[Path] = None
+    fileobj: typing.Optional[io.IOBase] = None
+    gzipped: bool = False
+
+    def yield_gzipped(self, fh):
+        yield from [line for line in fh.read().decode("utf-8").splitlines()]
+
+    def yield_plain(self, fh):
+        yield from [line.decode("utf-8") for line in fh]
+
+    @contextmanager
+    def open_path(self):
+        assert self.path
+        fh = self.path.open("rb")
+        try:
+            yield fh
+        finally:
+            fh.close()
+
+    @contextmanager
+    def open_gzip(self):
+        if self.fileobj:
+            yield gzip.GzipFile(fileobj=self.fileobj)
+        else:
+            with self.open_path() as fh:
+                yield gzip.GzipFile(fileobj=fh)
+
+    def __iter__(self):
+        yield_func = self.yield_gzipped if self.gzipped else self.yield_plain
+        open_func = self.open_gzip if self.gzipped else self.open_path
+
+        if not self.gzipped and self.fileobj:
+            yield from yield_func(self.fileobj)
+        else:
+            with open_func() as fh:
+                yield from yield_func(fh)
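A quick sketch of how the new `FileIterator` is meant to be driven (the file names here are hypothetical):

```python
from io import BytesIO
from pathlib import Path

from aws_log_parser.io import FileIterator

# Plain file on disk: opened "rb", each line decoded as UTF-8.
plain_lines = list(FileIterator(path=Path("access.log")))

# Gzipped file on disk: wrapped in gzip.GzipFile before decoding.
gz_lines = list(FileIterator(path=Path("access.log.gz"), gzipped=True))

# In-memory bytes, e.g. an S3 body already read into memory.
mem_lines = list(FileIterator(fileobj=BytesIO(b"one\ntwo\n")))
```

One quirk of the implementation as written: the gzipped path strips line endings via `splitlines()`, while the plain path decodes raw lines and keeps their trailing `\n`, so callers should not rely on consistent line endings between the two.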
11 changes: 7 additions & 4 deletions aws_log_parser/models.py
@@ -258,7 +258,10 @@ class WafLogEntry(LogEntry):
     timestamp: datetime.datetime = field(
         metadata=config(
             encoder=lambda t: datetime.datetime.timestamp(t) * 1000,
-            decoder=lambda t: datetime.datetime.utcfromtimestamp(t / 1000),
+            decoder=lambda t: datetime.datetime.fromtimestamp(
+                t / 1000,
+                datetime.timezone.utc,
+            ),
         )
     )
     formatVersion: int
@@ -271,9 +274,9 @@ class WafLogEntry(LogEntry):
     httpRequest: WafLogEntryHttpRequest
     ruleGroupList: typing.List[WafLogEntryRuleGroup] = field(default_factory=list)
     rateBasedRuleList: typing.List[WafLogEntryRateGroup] = field(default_factory=list)
-    nonTerminatingMatchingRules: typing.List[
-        WafLogEntryNonTerminatingMatchingRule
-    ] = field(default_factory=list)
+    nonTerminatingMatchingRules: typing.List[WafLogEntryNonTerminatingMatchingRule] = (
+        field(default_factory=list)
+    )
 
     @property
     def client_ip(self):
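The decoder change above replaces `datetime.utcfromtimestamp`, deprecated as of Python 3.12 (now in the CI matrix), with a timezone-aware `fromtimestamp`. A minimal round-trip check of the encoder/decoder pair, written as plain functions for readability:

```python
import datetime


def encode(t: datetime.datetime) -> float:
    # Mirrors the field's encoder: aware datetime -> epoch milliseconds.
    return datetime.datetime.timestamp(t) * 1000


def decode(ms: float) -> datetime.datetime:
    # Mirrors the field's decoder: epoch milliseconds -> UTC-aware datetime.
    return datetime.datetime.fromtimestamp(ms / 1000, datetime.timezone.utc)


ms = 1735344000000  # 2024-12-28T00:00:00Z in epoch milliseconds
dt = decode(ms)
assert dt == datetime.datetime(2024, 12, 28, tzinfo=datetime.timezone.utc)
assert encode(dt) == ms
```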
25 changes: 0 additions & 25 deletions bootstrap.sh

This file was deleted.

52 changes: 0 additions & 52 deletions examples/count-hosts-gzip-alb.py

This file was deleted.
