Commit: Bumping version to 0.0.5

igorborgest committed Sep 28, 2019
1 parent a4c9815 commit 82a27c9
Showing 10 changed files with 89 additions and 23 deletions.
2 changes: 1 addition & 1 deletion awswrangler/__version__.py
@@ -1,4 +1,4 @@
__title__ = "awswrangler"
__description__ = "Utility belt to handle data on AWS."
__version__ = "0.0.4"
__version__ = "0.0.5"
__license__ = "Apache License 2.0"
49 changes: 37 additions & 12 deletions awswrangler/pandas.py
@@ -50,6 +50,7 @@ def read_csv(
max_result_size=None,
header="infer",
names=None,
usecols=None,
dtype=None,
sep=",",
lineterminator="\n",
@@ -71,6 +72,7 @@
:param max_result_size: Max number of bytes on each request to S3
:param header: Same as pandas.read_csv()
:param names: Same as pandas.read_csv()
:param usecols: Same as pandas.read_csv()
:param dtype: Same as pandas.read_csv()
:param sep: Same as pandas.read_csv()
:param lineterminator: Same as pandas.read_csv()
@@ -96,6 +98,7 @@ def read_csv(
max_result_size=max_result_size,
header=header,
names=names,
usecols=usecols,
dtype=dtype,
sep=sep,
lineterminator=lineterminator,
@@ -113,6 +116,7 @@ def read_csv(
key_path=key_path,
header=header,
names=names,
usecols=usecols,
dtype=dtype,
sep=sep,
lineterminator=lineterminator,
@@ -133,6 +137,7 @@ def _read_csv_iterator(
max_result_size=200_000_000, # 200 MB
header="infer",
names=None,
usecols=None,
dtype=None,
sep=",",
lineterminator="\n",
@@ -155,6 +160,7 @@ def _read_csv_iterator(
:param max_result_size: Max number of bytes on each request to S3
:param header: Same as pandas.read_csv()
:param names: Same as pandas.read_csv()
:param usecols: Same as pandas.read_csv()
:param dtype: Same as pandas.read_csv()
:param sep: Same as pandas.read_csv()
:param lineterminator: Same as pandas.read_csv()
@@ -182,6 +188,7 @@ def _read_csv_iterator(
key_path=key_path,
header=header,
names=names,
usecols=usecols,
dtype=dtype,
sep=sep,
lineterminator=lineterminator,
@@ -235,6 +242,7 @@ def _read_csv_iterator(
StringIO(body[:last_char].decode("utf-8")),
header=header,
names=names,
usecols=usecols,
sep=sep,
quotechar=quotechar,
quoting=quoting,
@@ -353,6 +361,7 @@ def _read_csv_once(
key_path,
header="infer",
names=None,
usecols=None,
dtype=None,
sep=",",
lineterminator="\n",
@@ -374,6 +383,7 @@ def _read_csv_once(
:param key_path: S3 key path (W/o bucket)
:param header: Same as pandas.read_csv()
:param names: Same as pandas.read_csv()
:param usecols: Same as pandas.read_csv()
:param dtype: Same as pandas.read_csv()
:param sep: Same as pandas.read_csv()
:param lineterminator: Same as pandas.read_csv()
@@ -395,6 +405,7 @@ def _read_csv_once(
buff,
header=header,
names=names,
usecols=usecols,
sep=sep,
quotechar=quotechar,
quoting=quoting,
@@ -714,7 +725,8 @@ def _data_to_s3_dataset_writer(dataframe,
session_primitives,
file_format,
cast_columns=None,
extra_args=None):
extra_args=None,
isolated_dataframe=False):
objects_paths = []
if not partition_cols:
object_path = Pandas._data_to_s3_object_writer(
@@ -725,7 +737,8 @@
session_primitives=session_primitives,
file_format=file_format,
cast_columns=cast_columns,
extra_args=extra_args)
extra_args=extra_args,
isolated_dataframe=isolated_dataframe)
objects_paths.append(object_path)
else:
for keys, subgroup in dataframe.groupby(partition_cols):
@@ -744,7 +757,8 @@
session_primitives=session_primitives,
file_format=file_format,
cast_columns=cast_columns,
extra_args=extra_args)
extra_args=extra_args,
isolated_dataframe=True)
objects_paths.append(object_path)
return objects_paths

@@ -769,7 +783,8 @@ def _data_to_s3_dataset_writer_remote(send_pipe,
session_primitives=session_primitives,
file_format=file_format,
cast_columns=cast_columns,
extra_args=extra_args))
extra_args=extra_args,
isolated_dataframe=True))
send_pipe.close()

@staticmethod
@@ -780,7 +795,8 @@ def _data_to_s3_object_writer(dataframe,
session_primitives,
file_format,
cast_columns=None,
extra_args=None):
extra_args=None,
isolated_dataframe=False):
fs = s3.get_fs(session_primitives=session_primitives)
fs = pyarrow.filesystem._ensure_filesystem(fs)
s3.mkdir_if_not_exists(fs, path)
@@ -803,12 +819,14 @@ def _data_to_s3_object_writer(dataframe,
raise UnsupportedFileFormat(file_format)
object_path = "/".join([path, outfile])
if file_format == "parquet":
Pandas.write_parquet_dataframe(dataframe=dataframe,
path=object_path,
preserve_index=preserve_index,
compression=compression,
fs=fs,
cast_columns=cast_columns)
Pandas.write_parquet_dataframe(
dataframe=dataframe,
path=object_path,
preserve_index=preserve_index,
compression=compression,
fs=fs,
cast_columns=cast_columns,
isolated_dataframe=isolated_dataframe)
elif file_format == "csv":
Pandas.write_csv_dataframe(dataframe=dataframe,
path=object_path,
@@ -848,15 +866,17 @@ def write_csv_dataframe(dataframe,

@staticmethod
def write_parquet_dataframe(dataframe, path, preserve_index, compression,
fs, cast_columns):
fs, cast_columns, isolated_dataframe):
if not cast_columns:
cast_columns = {}

# Casting on Pandas
casted_in_pandas = []
dtypes = copy.deepcopy(dataframe.dtypes.to_dict())
for name, dtype in dtypes.items():
if str(dtype) == "Int64":
dataframe[name] = dataframe[name].astype("float64")
casted_in_pandas.append(name)
cast_columns[name] = "bigint"
logger.debug(f"Casting column {name} Int64 to float64")

@@ -885,6 +905,11 @@ def write_parquet_dataframe(dataframe, path, preserve_index, compression,
coerce_timestamps="ms",
flavor="spark")

# Casting back on Pandas if necessary
if isolated_dataframe is False:
for col in casted_in_pandas:
dataframe[col] = dataframe[col].astype("Int64")

def to_redshift(
self,
dataframe,
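The usecols argument added above is passed straight through to pandas.read_csv(), so column pruning now works on both the single-request path (_read_csv_once) and the chunked path (_read_csv_iterator). A minimal usage sketch, mirroring the new tests further down; the bucket name and column names are illustrative, and constructing the session with awswrangler.Session() and default credentials is an assumption rather than something this commit shows:

import awswrangler

session = awswrangler.Session()  # assumes default AWS credentials/region

# Single request: keep only the two named columns (illustrative names).
dataframe = session.pandas.read_csv(
    path="s3://my-bucket/data_samples/micro.csv",
    usecols=["id", "name"])

# Iterator path: usecols also accepts positional indexes, as in the new tests.
for chunk in session.pandas.read_csv(
        path="s3://my-bucket/data_samples/micro.csv",
        usecols=[0, 1],
        max_result_size=200_000_000):
    print(len(chunk.index), list(chunk.columns))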
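The last hunk of pandas.py restores pandas' nullable Int64 columns after the Parquet write: write_parquet_dataframe downcasts them to float64 first (presumably because the pyarrow version in use cannot serialize the extension dtype), records them in cast_columns as bigint, and the new isolated_dataframe flag says whether the frame is the caller's original (cast back afterwards) or a throwaway partition/worker copy (restore skipped). A standalone sketch of that round-trip in plain pandas, independent of awswrangler:

import pandas as pd

# A nullable integer column with a missing value.
df = pd.DataFrame({"id": pd.array([1, 2, None], dtype="Int64")})

casted_in_pandas = []
for name, dtype in df.dtypes.to_dict().items():
    if str(dtype) == "Int64":
        df[name] = df[name].astype("float64")  # missing values become NaN
        casted_in_pandas.append(name)          # remember which columns to restore

# ... the Parquet write would happen here, with cast_columns forcing "id" to bigint ...

# Restore the nullable dtype so the caller's dataframe is left unchanged.
# With isolated_dataframe=True (partition subgroups, remote workers) this step is skipped.
for name in casted_in_pandas:
    df[name] = df[name].astype("Int64")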
8 changes: 8 additions & 0 deletions building/build-glue-wheel.sh
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

cd ..
rm -rf *.egg-info build dist/*.whl
python3.6 setup.py bdist_wheel
rm -rf *.egg-info build
cd building
2 changes: 1 addition & 1 deletion building/publish.sh
@@ -3,6 +3,6 @@ set -e

cd ..
rm -fr build dist .egg awswrangler.egg-info
python setup.py sdist bdist_wheel
python setup.py sdist
twine upload dist/*
rm -fr build dist .egg awswrangler.egg-info
7 changes: 7 additions & 0 deletions docs/source/api/awswrangler.data_types.rst
@@ -0,0 +1,7 @@
awswrangler.data\_types module
==============================

.. automodule:: awswrangler.data_types
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/source/api/awswrangler.rst
@@ -8,6 +8,7 @@ Submodules

awswrangler.athena
awswrangler.cloudwatchlogs
awswrangler.data_types
awswrangler.exceptions
awswrangler.glue
awswrangler.pandas
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
botocore>=1.12.224
boto3>=1.9.224
botocore>=1.12.238
boto3>=1.9.238
pandas>=0.25.1
s3fs>=0.3.4
pyarrow>=0.14.1
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[bdist_wheel]
python-tag = py36,py37
python-tag = glue

[metadata]
license_file = LICENSE
4 changes: 2 additions & 2 deletions setup.py
@@ -24,8 +24,8 @@
install_requires=[
"pyarrow>=0.14.1",
"pandas>=0.25.1",
"botocore>=1.12.224",
"boto3>=1.9.224",
"botocore>=1.12.238",
"boto3>=1.9.238",
"s3fs>=0.3.4",
"tenacity>=5.1.1",
"pg8000>=1.13.2",
33 changes: 29 additions & 4 deletions testing/test_awswrangler/test_pandas.py
@@ -126,6 +126,33 @@ def test_read_csv_iterator(session, bucket, sample, row_num):
assert total_count == row_num


@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30),
("data_samples/small.csv", 100)])
def test_read_csv_usecols(session, bucket, sample, row_num):
boto3.client("s3").upload_file(sample, bucket, sample)
path = f"s3://{bucket}/{sample}"
dataframe = session.pandas.read_csv(path=path, usecols=["id", "name"])
session.s3.delete_objects(path=path)
assert len(dataframe.index) == row_num
assert len(dataframe.columns) == 2


@pytest.mark.parametrize("sample, row_num", [("data_samples/micro.csv", 30),
("data_samples/small.csv", 100)])
def test_read_csv_iterator_usecols(session, bucket, sample, row_num):
boto3.client("s3").upload_file(sample, bucket, sample)
path = f"s3://{bucket}/{sample}"
dataframe_iter = session.pandas.read_csv(path=path,
usecols=[0, 1],
max_result_size=200)
total_count = 0
for dataframe in dataframe_iter:
total_count += len(dataframe.index)
assert len(dataframe.columns) == 2
session.s3.delete_objects(path=path)
assert total_count == row_num


@pytest.mark.parametrize(
"mode, file_format, preserve_index, partition_cols, procs_cpu_bound, factor",
[
@@ -745,10 +772,8 @@ def test_to_parquet_with_cast_null(


def test_read_sql_athena_with_time_zone(session, bucket, database):
dataframe = session.pandas.read_sql_athena(
sql=
"select current_timestamp as value, typeof(current_timestamp) as type",
database=database)
query = "select current_timestamp as value, typeof(current_timestamp) as type"
dataframe = session.pandas.read_sql_athena(sql=query, database=database)
assert len(dataframe.index) == 1
assert len(dataframe.columns) == 2
assert dataframe["type"][0] == "timestamp with time zone"