diff --git a/README.md b/README.md index 90dbbfb..7b7ed66 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ The configuration is also captured in [tables_config_util.py](tap_spreadsheets_a "delimiter": "|", "quotechar": '"', "universal_newlines": false, + "skip_initial": 0, "sample_rate": 10, "max_sampling_read": 2000, "max_sampled_files": 3, @@ -102,6 +103,7 @@ Each object in the 'tables' array describes one or more CSV or Excel spreadsheet . Set this key to "ignore" to skip such source files and continue the run. - **field_names**: (optional) An array holding the names of the columns in the targeted files. If not supplied, the first row of each file must hold the desired values. - **universal_newlines**: (optional) Should the source file parsers honor [universal newlines](https://docs.python.org/2.3/whatsnew/node7.html)). Setting this to false will instruct the parser to only consider '\n' as a valid newline identifier. +- **skip_initial**: (optional) The number of rows to skip at the beginning of each source file. The default is 0, which skips nothing. - **sample_rate**: (optional) The sampling rate to apply when reading a source file for sampling in discovery mode. A sampling rate of 1 will sample every line. A sampling rate of 10 (the default) will sample every 10th line. - **max_sampling_read**: (optional) How many lines of the source file should be sampled when in discovery mode attempting to infer a schema. The default is 1000 samples. - **max_sampled_files**: (optional) The maximum number of files in the targeted set that will be sampled. The default is 5. 
diff --git a/sample_config.json b/sample_config.json index 8d8abab..30e6d41 100644 --- a/sample_config.json +++ b/sample_config.json @@ -8,6 +8,7 @@ "key_properties": [], "format": "csv", "universal_newlines": false, + "skip_initial": 0, "sample_rate": 10, "max_sampling_read": 2000, "max_sampled_files": 3, diff --git a/tap_spreadsheets_anywhere/configuration.py b/tap_spreadsheets_anywhere/configuration.py index 67bc49a..652c25f 100644 --- a/tap_spreadsheets_anywhere/configuration.py +++ b/tap_spreadsheets_anywhere/configuration.py @@ -15,6 +15,7 @@ Required('format'): Any('csv', 'excel', 'json', 'jsonl', 'detect'), Optional('invalid_format_action'): Any('ignore','fail'), Optional('universal_newlines'): bool, + Optional('skip_initial'): int, Optional('selected'): bool, Optional('field_names'): [str], Optional('search_prefix'): str, diff --git a/tap_spreadsheets_anywhere/format_handler.py b/tap_spreadsheets_anywhere/format_handler.py index 2d1fc68..7f843cf 100644 --- a/tap_spreadsheets_anywhere/format_handler.py +++ b/tap_spreadsheets_anywhere/format_handler.py @@ -121,6 +121,7 @@ def mp_readline(self, size=None, keepends=False): def get_row_iterator(table_spec, uri): universal_newlines = table_spec['universal_newlines'] if 'universal_newlines' in table_spec else True + skip_initial = table_spec.get("skip_initial", 0) if 'format' not in table_spec or table_spec['format'] == 'detect': lowered_uri = uri.lower() @@ -153,19 +154,23 @@ def get_row_iterator(table_spec, uri): try: if format == 'csv': reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') - return tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader) + iterator = tap_spreadsheets_anywhere.csv_handler.get_row_iterator(table_spec, reader) elif format == 'excel': reader = get_streamreader(uri, universal_newlines=universal_newlines,newline=None, open_mode='rb') if uri.lower().endswith(".xls"): - return 
tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader) + iterator = tap_spreadsheets_anywhere.excel_handler.get_legacy_row_iterator(table_spec, reader) else: - return tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader) + iterator = tap_spreadsheets_anywhere.excel_handler.get_row_iterator(table_spec, reader) elif format == 'json': reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') - return tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader) + iterator = tap_spreadsheets_anywhere.json_handler.get_row_iterator(table_spec, reader) elif format == 'jsonl': reader = get_streamreader(uri, universal_newlines=universal_newlines, open_mode='r') - return tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader) + iterator = tap_spreadsheets_anywhere.jsonl_handler.get_row_iterator(table_spec, reader) except (ValueError,TypeError) as err: raise InvalidFormatError(uri,message=err) + for _ in range(skip_initial): + next(iterator) + + return iterator