diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3db4457ca..64be2a0b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,6 +5,9 @@ ci: skip: - uv-lock +default_language_version: + python: python3.13 + repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 @@ -65,3 +68,8 @@ repos: hooks: - id: uv-lock - id: uv-sync + +- repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell diff --git a/CHANGELOG.md b/CHANGELOG.md index 457f5a34e..52d6a75e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#2482](https://github.com/meltano/sdk/issues/2482) Allow SQL tap developers to auto-skip certain schemas from discovery - [#2784](https://github.com/meltano/sdk/issues/2784) Added a new built-in setting `activate_version` for targets to optionally disable processing of `ACTIVATE_VERSION` messages - [#2780](https://github.com/meltano/sdk/issues/2780) Numeric values are now parsed as `decimal.Decimal` in REST and GraphQL stream responses -- [#2775](https://github.com/meltano/sdk/issues/2775) Log a stream's bookmark (if it's avaiable) when its sync starts +- [#2775](https://github.com/meltano/sdk/issues/2775) Log a stream's bookmark (if it's available) when its sync starts - [#2703](https://github.com/meltano/sdk/issues/2703) Targets now emit record count from the built-in batch file processor - [#2774](https://github.com/meltano/sdk/issues/2774) Accept a `maxLength` limit for VARCHARs - [#2769](https://github.com/meltano/sdk/issues/2769) Add `versioning-strategy` to dependabot config of Cookiecutter templates @@ -210,7 +210,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### ✨ New - [#2432](https://github.com/meltano/sdk/issues/2432) Developers can now customize the default logging configuration for their taps/targets by adding `default_logging.yml` to 
their package -- [#2531](https://github.com/meltano/sdk/issues/2531) The `json` module is now avaiable to stream maps -- _**Thanks @grigi!**_ +- [#2531](https://github.com/meltano/sdk/issues/2531) The `json` module is now available to stream maps -- _**Thanks @grigi!**_ - [#2529](https://github.com/meltano/sdk/issues/2529) Stream sync context is now available to all instances methods as a `Stream.context` attribute ### 🐛 Fixes @@ -330,7 +330,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### 📚 Documentation Improvements - [#2239](https://github.com/meltano/sdk/issues/2239) Linked reference docs to source code -- [#2231](https://github.com/meltano/sdk/issues/2231) Added an example implemetation of JSON schema validation that uses `fastjsonschema` +- [#2231](https://github.com/meltano/sdk/issues/2231) Added an example implementation of JSON schema validation that uses `fastjsonschema` - [#2219](https://github.com/meltano/sdk/issues/2219) Added reference docs for tap & target testing helpers ## v0.35.0 (2024-02-02) @@ -748,7 +748,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### ✨ New -- [#1262](https://github.com/meltano/sdk/issues/1262) Support string `"__NULL__"` whereever null values are allowed in stream maps configuration +- [#1262](https://github.com/meltano/sdk/issues/1262) Support string `"__NULL__"` wherever null values are allowed in stream maps configuration ### 🐛 Fixes @@ -1286,7 +1286,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changes -- Target SDK: Improved performance for Batch Sinks by skipping extra drain operations when newly recieved STATE messages are unchanged from the prior received STATE (#172, !125) -- _Thanks, **[Pat Nadolny](https://gitlab.com/pnadolny13)**!_ +- Target SDK: Improved performance for Batch Sinks by skipping extra drain operations when newly received STATE messages are unchanged from the prior received 
STATE (#172, !125) -- _Thanks, **[Pat Nadolny](https://gitlab.com/pnadolny13)**!_ ### Fixes diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index c2140912d..58633dec5 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -161,7 +161,7 @@ Sphinx will automatically generate class stubs, so be sure to `git add` them. ## Semantic Pull Requests -This repo uses the [semantic-prs](https://github.com/Ezard/semantic-prs) GitHub app to check all PRs againts the conventional commit syntax. +This repo uses the [semantic-prs](https://github.com/Ezard/semantic-prs) GitHub app to check all PRs against the conventional commit syntax. Pull requests should be named according to the conventional commit syntax to streamline changelog and release notes management. We encourage (but do not require) the use of conventional commits in commit messages as well. diff --git a/docs/dev_guide.md b/docs/dev_guide.md index cc7cd9004..43563aa3f 100644 --- a/docs/dev_guide.md +++ b/docs/dev_guide.md @@ -291,7 +291,7 @@ That command will produce a `result.json` file which you can explore with the `v $ poetry run vizviewer result.json ``` -Thet output should look like this +The output should look like this ![SDK Flame Graph](https://gitlab.com/meltano/sdk/uploads/07633ba1217de6eb1bb0e018133c608d/_write_record_message.png) diff --git a/docs/implementation/at_least_once.md b/docs/implementation/at_least_once.md index 8643202f9..c7b05fed6 100644 --- a/docs/implementation/at_least_once.md +++ b/docs/implementation/at_least_once.md @@ -18,7 +18,7 @@ According to the Singer spec, bookmark comparisons are performed on the basis of [Replication Key Signposts](./state.md#replication-key-signposts) are an internal and automatic feature of the SDK. Signposts are necessary in order to deliver the 'at least once' delivery promise for unsorted streams and parent-child streams. 
The function of a signpost is to ensure that bookmark keys do not advance past a point where we may have not synced all records, such as for unsorted or reverse-sorted streams. This feature also enables developers to override `state_partitioning_key`, which reduces the number of bookmarks needed to track state on parent-child streams with a large number of parent records. -In all applications, the signpost prevents the bookmark's value from advancing too far and prevents records from being skipped in future sync operations. We _intentionally_ do not advance the bookmark as far as the max replication key value from all records we've synced, with the knowlege that _some_ records with equal or lower replication key values may have not yet been synced. It follows then, that any records whose replication key is greater than the signpost value will necessarily be re-synced in the next execution, causing some amount of record duplication downstream. +In all applications, the signpost prevents the bookmark's value from advancing too far and prevents records from being skipped in future sync operations. We _intentionally_ do not advance the bookmark as far as the max replication key value from all records we've synced, with the knowledge that _some_ records with equal or lower replication key values may have not yet been synced. It follows then, that any records whose replication key is greater than the signpost value will necessarily be re-synced in the next execution, causing some amount of record duplication downstream. ### Cause #3: Stream interruption @@ -32,11 +32,11 @@ There are two generally recommended approaches for dealing with record duplicati Assuming that a primary key exists, most target implementation will simply use the primary key to merge newly received records with their prior versions, eliminating any risk of duplication in the destination dataset. 
-However, this approach will not work for streams that lack primary keys or in implentations running in pure 'append only' mode. For these cases, some amount of record duplication should be expected and planned for by the end user. +However, this approach will not work for streams that lack primary keys or in implementations running in pure 'append only' mode. For these cases, some amount of record duplication should be expected and planned for by the end user. ### Strategy #2: Removing duplicates using `dbt` transformations -For cases where the destination table _does not_ use primary keys, the most common way of resolving duplicates after they've landed in the downstream dataset is to apply a `ROW_NUMBER()` function in a tool like [dbt](https://www.getdbt.com). The `ROW_NUMBER()` function can caculate a `dedupe_rank` and/or a `recency_rank` in the transformation layer, and then downstream queries can easily filter out any duplicates using the calculated rank. Users can write these transformations by hand or leverage the [deduplicate-source](https://github.com/dbt-labs/dbt-utils#deduplicate-source) macro from the [dbt-utils](https://github.com/dbt-labs/dbt-utils) package. +For cases where the destination table _does not_ use primary keys, the most common way of resolving duplicates after they've landed in the downstream dataset is to apply a `ROW_NUMBER()` function in a tool like [dbt](https://www.getdbt.com). The `ROW_NUMBER()` function can calculate a `dedupe_rank` and/or a `recency_rank` in the transformation layer, and then downstream queries can easily filter out any duplicates using the calculated rank. Users can write these transformations by hand or leverage the [deduplicate-source](https://github.com/dbt-labs/dbt-utils#deduplicate-source) macro from the [dbt-utils](https://github.com/dbt-labs/dbt-utils) package. 
#### Sample dedupe implementation using `dbt`: diff --git a/docs/implementation/cli.md b/docs/implementation/cli.md index 92d77fa37..cd0666d34 100644 --- a/docs/implementation/cli.md +++ b/docs/implementation/cli.md @@ -99,7 +99,7 @@ The SDK automatically applies selection logic as described by the Selection rules are applied at three levels: -1. **Streams** are filtered out if they are deselected or ommitted in the input catalog. +1. **Streams** are filtered out if they are deselected or omitted in the input catalog. 2. **RECORD messages** are filtered based upon selection rules in the input catalog. 3. **SCHEMA messages** are filtered based upon selection rules in the input catalog. diff --git a/docs/partitioning.md b/docs/partitioning.md index 144d210cf..409ba079d 100644 --- a/docs/partitioning.md +++ b/docs/partitioning.md @@ -4,7 +4,7 @@ The Tap SDK supports stream partitioning, meaning a set of substreams which each have their own state and their own distinct queryable domain. You can read more about state partitioning in the -[State Implemetation](./implementation/state.md#partitioned-state) explanation +[State Implementation](./implementation/state.md#partitioned-state) explanation document. ## If you do not require partitioning diff --git a/docs/stream_maps.md b/docs/stream_maps.md index 9df083e9b..f9d5711a0 100644 --- a/docs/stream_maps.md +++ b/docs/stream_maps.md @@ -175,7 +175,7 @@ to expressions using the `config` dictionary. ### Constructing Expressions Expressions are defined and parsed using the -[`simpleval`](https://github.com/danthedeckie/simpleeval) expression library. This library +[`simpleeval`](https://github.com/danthedeckie/simpleeval) expression library. This library accepts most native python expressions and is extended by custom functions which have been declared within the SDK. 
@@ -499,7 +499,7 @@ faker_config: locale: en_US ``` -Remember, these expressions are evaluated by the [`simpleval`](https://github.com/danthedeckie/simpleeval) expression library, which only allows a single python expression (which is the reason for the `or` syntax above). +Remember, these expressions are evaluated by the [`simpleeval`](https://github.com/danthedeckie/simpleeval) expression library, which only allows a single python expression (which is the reason for the `or` syntax above). This means if you require more advanced masking logic, which cannot be defined in a single python expression, you may need to consider a custom stream mapper. @@ -749,7 +749,7 @@ excluded at the tap level, then the stream will be skipped exactly as if it were in the catalog metadata. If a stream is specified to be excluded at the target level, or in a standalone mapper -between the tap and target, the filtering occurs downstream from the tap and therefor cannot +between the tap and target, the filtering occurs downstream from the tap and therefore cannot affect the selection rules of the tap itself. Except in special test cases or in cases where runtime is trivial, we highly recommend implementing stream-level exclusions at the tap level rather than within the downstream target or mapper plugins. diff --git a/pyproject.toml b/pyproject.toml index 0484ae81d..17b999d22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -421,3 +421,7 @@ max-args = 9 [tool.uv] required-version = ">=0.5.19" + +[tool.codespell] +skip = "*.csv,samples/aapl/*.json,samples/*/schemas/*.json" +ignore-words-list = "fo,intoto" diff --git a/singer_sdk/connectors/sql.py b/singer_sdk/connectors/sql.py index 5e451ec9c..9abd68afa 100644 --- a/singer_sdk/connectors/sql.py +++ b/singer_sdk/connectors/sql.py @@ -919,7 +919,7 @@ def get_object_names( # pragma: no cover view_names = [] return [(t, False) for t in table_names] + [(v, True) for v in view_names] - # TODO maybe should be splitted into smaller parts? 
+ # TODO maybe should be split into smaller parts? def discover_catalog_entry( self, engine: Engine, # noqa: ARG002 diff --git a/singer_sdk/contrib/filesystem/stream.py b/singer_sdk/contrib/filesystem/stream.py index 4c1b285de..9683e6625 100644 --- a/singer_sdk/contrib/filesystem/stream.py +++ b/singer_sdk/contrib/filesystem/stream.py @@ -60,7 +60,7 @@ def __init__( super().__init__(tap, schema=None, name=name) - # TODO(edgarrmondragon): Make this None if the filesytem does not support it. + # TODO(edgarrmondragon): Make this None if the filesystem does not support it. self.replication_key = SDC_META_MODIFIED_AT self._sync_start_time = utc_now() self._partitions = [{SDC_META_FILEPATH: path} for path in self._filepaths] diff --git a/singer_sdk/helpers/capabilities.py b/singer_sdk/helpers/capabilities.py index b1f18887a..450df6c67 100644 --- a/singer_sdk/helpers/capabilities.py +++ b/singer_sdk/helpers/capabilities.py @@ -65,7 +65,7 @@ description=( "Config for the [`Faker`](https://faker.readthedocs.io/en/master/) " "instance variable `fake` used within map expressions. Only applicable if " - "the plugin specifies `faker` as an addtional dependency (through the " + "the plugin specifies `faker` as an additional dependency (through the " "`singer-sdk` `faker` extra or directly)." ), ), @@ -340,7 +340,7 @@ class PluginCapabilities(CapabilitiesEnum): #: Support :doc:`inline stream map transforms`. STREAM_MAPS = "stream-maps" - #: Support schema flattening, aka denesting of complex properties. + #: Support schema flattening, aka unnesting of complex properties. 
FLATTENING = "schema-flattening" #: Support the diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index d528831a1..db518485a 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -667,7 +667,7 @@ def _init_faker_instance(self) -> Faker | None: class PluginMapper: - """Inline map tranformer.""" + """Inline map transformer.""" def __init__( self, diff --git a/singer_sdk/plugin_base.py b/singer_sdk/plugin_base.py index fcaa873b1..e9017add1 100644 --- a/singer_sdk/plugin_base.py +++ b/singer_sdk/plugin_base.py @@ -257,7 +257,7 @@ def initialized_at(self) -> int: def capabilities(self) -> list[CapabilitiesEnum]: # noqa: PLR6301 """Get capabilities. - Developers may override this property in oder to add or remove + Developers may override this property in order to add or remove advertised capabilities for this plugin. Returns: diff --git a/singer_sdk/sinks/sql.py b/singer_sdk/sinks/sql.py index 855ee351f..9a82e59bd 100644 --- a/singer_sdk/sinks/sql.py +++ b/singer_sdk/sinks/sql.py @@ -91,7 +91,7 @@ def schema_name(self) -> str | None: Returns: The target schema name. """ - # Look for a default_target_scheme in the configuraion fle + # Look for a default_target_schema in the configuration file default_target_schema: str = self.config.get("default_target_schema", None) parts = self.stream_name.split("-") diff --git a/tests/core/test_streams.py b/tests/core/test_streams.py index 8b595ac0d..8e4f164cc 100644 --- a/tests/core/test_streams.py +++ b/tests/core/test_streams.py @@ -444,7 +444,7 @@ def records_jsonpath(cls): # noqa: N805 { "link": [ { - "releation": "previous", + "relation": "previous", "url": "https://myapi.test/6" }, {