Skip to content

Commit

Permalink
detect encoding only if it's not provided by the user (#4)
Browse files (browse the repository at this point in the history)
  • Loading branch information
kPsarakis authored Jul 14, 2022
1 parent 0161e66 commit 1a63761
Showing 1 changed file with 14 additions and 9 deletions.
23 changes: 14 additions & 9 deletions geovaex/io.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -190,15 +190,20 @@ def to_arrow(file, arrow_file, chunksize=2000000, crs=None, **kwargs):
with pa.OSFile(arrow_file, 'wb') as sink:
writer = None
# by default assume utf-8 encoding
encoding = 'utf-8'
if not os.path.isfile(file):
cpg_file = [os.path.join(file, f) for f in os.listdir(file) if f.endswith('.cpg')]
if len(cpg_file) == 1:
# if there is a cpg file read the encoding out of its first line
cpg_file = cpg_file[0]
with open(cpg_file) as f:
encoding = f.readline()
for table in to_arrow_table(file, chunksize=chunksize, crs=crs, encoding=encoding, **kwargs):
if 'encoding' not in kwargs:
encoding = 'utf-8'
if not os.path.isfile(file):
cpg_file = [os.path.join(file, f) for f in os.listdir(file) if f.endswith('.cpg')]
if len(cpg_file) == 1:
# if there is a cpg file read the encoding out of its first line
cpg_file = cpg_file[0]
with open(cpg_file) as f:
encoding = f.readline()
arrow_generator = to_arrow_table(file, chunksize=chunksize, crs=crs, encoding=encoding, **kwargs)
else:
arrow_generator = to_arrow_table(file, chunksize=chunksize, crs=crs, **kwargs)

for table in arrow_generator:
b = table.to_batches()
if writer is None:
writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
Expand Down

0 comments on commit 1a63761

Please sign in to comment.