From 1a6376155f642f7546c95a1863c8ebdc7375ccf9 Mon Sep 17 00:00:00 2001
From: Kyriakos Psarakis
Date: Thu, 14 Jul 2022 16:06:49 +0300
Subject: [PATCH] detect encoding only if it's not provided by the user (#4)

---
 geovaex/io.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/geovaex/io.py b/geovaex/io.py
index c2e1d9f..2742e24 100644
--- a/geovaex/io.py
+++ b/geovaex/io.py
@@ -190,15 +190,20 @@ def to_arrow(file, arrow_file, chunksize=2000000, crs=None, **kwargs):
     with pa.OSFile(arrow_file, 'wb') as sink:
         writer = None
         # by default assume utf-8 encoding
-        encoding = 'utf-8'
-        if not os.path.isfile(file):
-            cpg_file = [os.path.join(file, f) for f in os.listdir(file) if f.endswith('.cpg')]
-            if len(cpg_file) == 1:
-                # if there is a cpg file read the encoding out of its first line
-                cpg_file = cpg_file[0]
-                with open(cpg_file) as f:
-                    encoding = f.readline()
-        for table in to_arrow_table(file, chunksize=chunksize, crs=crs, encoding=encoding, **kwargs):
+        if 'encoding' not in kwargs:
+            encoding = 'utf-8'
+            if not os.path.isfile(file):
+                cpg_file = [os.path.join(file, f) for f in os.listdir(file) if f.endswith('.cpg')]
+                if len(cpg_file) == 1:
+                    # if there is a cpg file read the encoding out of its first line
+                    cpg_file = cpg_file[0]
+                    with open(cpg_file) as f:
+                        encoding = f.readline()
+            arrow_generator = to_arrow_table(file, chunksize=chunksize, crs=crs, encoding=encoding, **kwargs)
+        else:
+            arrow_generator = to_arrow_table(file, chunksize=chunksize, crs=crs, **kwargs)
+
+        for table in arrow_generator:
             b = table.to_batches()
             if writer is None:
                 writer = pa.RecordBatchStreamWriter(sink, b[0].schema)