Debug and fixes
hagenw committed Jun 20, 2024
1 parent 14750d8 commit fa95ed2
Showing 5 changed files with 126 additions and 31 deletions.
2 changes: 1 addition & 1 deletion audb/core/dependencies.py
@@ -205,7 +205,7 @@ def table_ids(self) -> typing.List[str]:
list of table IDs
"""
return [table[3:-4] for table in self.tables]
return [os.path.splitext(table[3:])[0] for table in self.tables]

@property
def tables(self) -> typing.List[str]:
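For context on the dependencies.py change: the old expression assumed table file names always end in a fixed four-character ".csv" suffix, while the new one strips whatever extension is present, so PARQUET tables are handled as well. A minimal standalone sketch (example file names only, not part of the commit):

import os

tables = ["db.files.csv", "db.speaker.parquet"]

# Old: strip the "db." prefix and a fixed 4-character suffix (only correct for ".csv")
print([t[3:-4] for t in tables])                      # ['files', 'speaker.par']

# New: strip the "db." prefix, then drop the extension, whatever it is
print([os.path.splitext(t[3:])[0] for t in tables])   # ['files', 'speaker']
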
9 changes: 6 additions & 3 deletions audb/core/publish.py
@@ -259,10 +259,13 @@ def _find_tables(
) -> typing.List[str]:
r"""Find altered, new or removed tables and update 'deps'."""
table_ids = list(db)
# PARQUET is the default table format,
# a CSV file is ignored
# if it exists as well
table_files = [
f"db.{table}.csv"
if os.path.exists(os.path.join(db_root, f"db.{table}.csv"))
else f"db.{table}.parquet"
f"db.{table}.parquet"
if os.path.exists(os.path.join(db_root, f"db.{table}.parquet"))
else f"db.{table}.csv"
for table in table_ids
]

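In other words, when a table exists both as a CSV and as a PARQUET file in the build directory, the PARQUET file is now picked up and the CSV file is ignored. A hedged sketch of that selection logic (hypothetical db_root and table IDs, not part of the commit):

import os

def table_file(db_root: str, table: str) -> str:
    # Prefer the PARQUET file; fall back to the CSV file if no PARQUET file exists
    parquet = f"db.{table}.parquet"
    if os.path.exists(os.path.join(db_root, parquet)):
        return parquet
    return f"db.{table}.csv"

# Hypothetical usage:
# table_file("/path/to/db-root", "files")  -> "db.files.parquet" if present, else "db.files.csv"
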
54 changes: 37 additions & 17 deletions tests/test_dependencies.py
@@ -24,6 +24,19 @@
"type": 0,
"version": "1.0.0",
},
{
"file": "db.speaker.parquet",
"archive": "",
"bit_depth": 0,
"channels": 0,
"checksum": "3hf774jkf7hfjjg775678djjd5e7dfh3",
"duration": 0.0,
"format": "parquet",
"removed": 0,
"sampling_rate": 0,
"type": 0,
"version": "1.0.0",
},
{
"file": "file.wav",
"archive": "archive2",
@@ -45,7 +58,7 @@ def get_entries(column):


def test_get_entries():
assert get_entries("archive") == ["archive1", "archive2"]
assert get_entries("archive") == ["archive1", "", "archive2"]


@pytest.fixture(
@@ -121,6 +134,7 @@ def test_call(deps):
def test_contains(deps):
assert "db.files.csv" in deps
assert "file.wav" in deps
assert "db.speaker.parquet" in deps
assert "not.csv" not in deps


@@ -142,13 +156,16 @@ def test_equals(deps):

def test_get_item(deps):
assert deps["db.files.csv"] == list(ROWS[0].values())[1:]
assert deps["file.wav"] == list(ROWS[1].values())[1:]
print(deps["db.files.csv"])
print(deps().archive)
assert deps["db.speaker.parquet"] == list(ROWS[1].values())[1:]
assert deps["file.wav"] == list(ROWS[2].values())[1:]
with pytest.raises(KeyError, match="non.existing"):
deps["non.existing"]


def test_archives(deps):
assert deps.archives == get_entries("archive")
assert deps.archives == sorted(get_entries("archive"))


def test_files(deps):
@@ -164,11 +181,11 @@ def test_removed_media(deps):


def test_table_ids(deps):
assert deps.table_ids == ["files"]
assert deps.table_ids == ["files", "speaker"]


def test_tables(deps):
assert deps.tables == ["db.files.csv"]
assert deps.tables == ["db.files.csv", "db.speaker.parquet"]


def test_archive(deps):
@@ -428,10 +445,12 @@ def test_str(deps):
# as the representation might vary,
# see https://github.com/audeering/audb/issues/422
expected_str = re.compile(
" archive bit_depth channels .+? type version\n"
"db.files.csv archive1 0 0 .+? 0 1.0.0\n"
"file.wav archive2 16 2 .+? 1 1.0.0.*?"
" archive bit_depth channels .+? type version\n"
"db.files.csv archive1 0 0 .+? 0 1.0.0\n"
"db.speaker.parquet 0 0 .+? 0 1.0.0\n"
"file.wav archive2 16 2 .+? 1 1.0.0.*?"
)
print(str(deps))
assert expected_str.match(str(deps))
assert expected_str.match(deps._df.to_string())

@@ -445,7 +464,7 @@ def test_str(deps):
)
def test_add_attachment(deps, file, version, archive, checksum):
deps._add_attachment(file, version, archive, checksum)
assert len(deps) == 3
assert len(deps) == len(ROWS) + 1
assert deps.version(file) == version
assert deps.archive(file) == archive
assert deps.checksum(file) == checksum
@@ -486,7 +505,7 @@ def test_add_attachment(deps, file, version, archive, checksum):
)
def test_add_media(deps, values):
deps._add_media(values)
assert len(deps) == 4
assert len(deps) == len(ROWS) + 2  # as an attachment was already added before
for (
file,
archive,
@@ -520,7 +539,7 @@ def test_add_media(deps, values):
)
def test_add_meta(deps, file, version, archive, checksum):
deps._add_meta(file, version, archive, checksum)
assert len(deps) == 3
assert len(deps) == len(ROWS) + 1
assert deps.version(file) == version
assert deps.archive(file) == archive
assert deps.checksum(file) == checksum
@@ -529,9 +548,10 @@ def test_add_media(deps, values):
@pytest.mark.parametrize(
"files, expected_length",
[
(["file.wav"], 1),
(["db.files.csv"], 1),
(["file.wav", "db.files.csv"], 0),
(["file.wav"], 2),
(["db.files.csv"], 2),
(["file.wav", "db.files.csv"], 1),
(["file.wav", "db.files.csv", "db.speaker.parquet"], 0),
],
)
def test_drop(deps, files, expected_length):
@@ -551,7 +571,7 @@ def test_drop(deps, files, expected_length):
def test_remove(deps, file):
assert not deps.removed(file)
deps._remove(file)
assert len(deps) == 2
assert len(deps) == len(ROWS)
assert deps.removed(file)


@@ -595,7 +615,7 @@ def test_remove(deps, file):
)
def test_update_media(deps, values):
deps._update_media(values)
assert len(deps) == 2
assert len(deps) == len(ROWS)
for (
file,
archive,
@@ -635,6 +655,6 @@ def test_update_media(deps, values):
)
def test_update_media_version(deps, files, version):
deps._update_media_version(files, version)
assert len(deps) == 2
assert len(deps) == len(ROWS)
for file in files:
assert deps.version(file) == version
25 changes: 15 additions & 10 deletions tests/test_load.py
@@ -33,11 +33,16 @@ def assert_database_tmp_folder_is_deleted():
assert len([d for d in dirs if d.endswith("~")]) == 0


@pytest.fixture(scope="module", autouse=False)
def storage_format():
yield "csv"


@pytest.fixture(
scope="module",
autouse=True,
)
def dbs(tmpdir_factory, persistent_repository):
def dbs(tmpdir_factory, persistent_repository, storage_format):
r"""Publish different versions of the same database.
Returns:
@@ -96,7 +101,7 @@ def dbs(tmpdir_factory, persistent_repository):
audeer.touch(db_root, "extra/folder/file1.txt")
audeer.touch(db_root, "extra/folder/file2.txt")
audeer.touch(db_root, "extra/folder/sub-folder/file3.txt")
db.save(db_root)
db.save(db_root, storage_format=storage_format)
audformat.testing.create_audio_files(db)
archives = db["files"]["speaker"].get().dropna().to_dict()
audb.publish(
@@ -126,7 +131,7 @@ def dbs(tmpdir_factory, persistent_repository):
)
os.remove(audeer.path(db_root, "extra/folder/file2.txt"))

db.save(db_root)
db.save(db_root, storage_format=storage_format)
audformat.testing.create_audio_files(db)
shutil.copy(
audeer.path(previous_db_root, audb.core.define.DEPENDENCIES_FILE),
@@ -153,7 +158,7 @@ def dbs(tmpdir_factory, persistent_repository):
audeer.path(db_root, "extra"),
)

db.save(db_root)
db.save(db_root, storage_format=storage_format)
audformat.testing.create_audio_files(db)
shutil.copy(
audeer.path(previous_db_root, audb.core.define.DEPENDENCIES_FILE),
@@ -180,7 +185,7 @@ def dbs(tmpdir_factory, persistent_repository):
del db.attachments["file"]
os.remove(audeer.path(db_root, "extra/file.txt"))

db.save(db_root)
db.save(db_root, storage_format=storage_format)
audformat.testing.create_audio_files(db)
file = os.path.join(db_root, db.files[0])
y, sr = audiofile.read(file)
@@ -189,7 +194,7 @@ def dbs(tmpdir_factory, persistent_repository):
file = db.files[-1]
db.pick_files(lambda x: x != file)
os.remove(audeer.path(db_root, file))
db.save(db_root)
db.save(db_root, storage_format=storage_format)

shutil.copy(
os.path.join(previous_db_root, audb.core.define.DEPENDENCIES_FILE),
@@ -217,7 +222,7 @@ def dbs(tmpdir_factory, persistent_repository):
fp.write("text")

db.drop_tables("train")
db.save(db_root)
db.save(db_root, storage_format=storage_format)
audformat.testing.create_audio_files(db)
shutil.copy(
os.path.join(previous_db_root, audb.core.define.DEPENDENCIES_FILE),
@@ -329,7 +334,7 @@ def test_load(dbs, format, version, only_metadata):
else:
assert os.path.exists(os.path.join(db_root, file))

# Assert tables are identical and exist as CSV files
# Assert tables are identical and table files exist
for table in db:
assert os.path.exists(os.path.join(db_root, f"db.{table}.csv"))
pd.testing.assert_frame_equal(
@@ -381,7 +386,7 @@ def test_load(dbs, format, version, only_metadata):
else:
assert os.path.exists(file)

# Assert table CSV files exist
# Assert table files exist
for table in db:
assert os.path.exists(os.path.join(db_root, f"db.{table}.csv"))

@@ -686,7 +691,7 @@ def test_load_to(tmpdir, dbs, version, only_metadata):
else:
assert os.path.exists(os.path.join(db_root, file))

# Assert tables are identical and exist as CSV files
# Assert tables are identical and exist as files
for table in db:
assert os.path.exists(os.path.join(db_root, f"db.{table}.csv"))
pd.testing.assert_frame_equal(
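The new module-scoped storage_format fixture defaults to "csv" and is threaded into dbs(), so the same test module can presumably be reused for PARQUET tables by overriding or parametrizing that fixture. A minimal sketch of such an override (hypothetical, not part of the commit):

import pytest

# Parametrizing the module-scoped fixture would run every test in the module
# once per storage format
@pytest.fixture(scope="module", params=["csv", "parquet"])
def storage_format(request):
    yield request.param
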
67 changes: 67 additions & 0 deletions tests/test_publish.py
@@ -1064,6 +1064,73 @@ def test_publish_error_version(tmpdir, repository):
audb.publish(db_path, "2.0.0", repository, previous_version="1.0.0?")


def test_publish_parquet_tables(tmpdir, repository):
r"""Test publishing and loading of parquet tables."""
build_dir = audeer.path(tmpdir, "./build")
audeer.mkdir(build_dir)
data_dir = audeer.mkdir(build_dir, "data")
audio_file = audeer.path(data_dir, "file1.wav")
signal = np.zeros((2, 1000))
sampling_rate = 8000
audiofile.write(audio_file, signal, sampling_rate)
name = "test-db"
db = audformat.Database(name)
db.schemes["speaker"] = audformat.Scheme("str")
index = audformat.filewise_index(["data/file1.wav"])
db["files"] = audformat.Table(index)
db["files"]["speaker"] = audformat.Column(scheme_id="speaker")
db["files"]["speaker"].set(["adam"])
db.save(build_dir, storage_format="parquet")
print(f"{audeer.list_file_names(build_dir, basenames=True)=}")

# Publish database
version = "1.0.0"
deps = audb.publish(build_dir, version, repository)

# Check files are published to repository
repo = audeer.path(repository.host, repository.name)
archive = deps.archive("data/file1.wav")
expected_paths = [
audeer.path(repo, name, version, "db.parquet"),
audeer.path(repo, name, version, "db.yaml"),
audeer.path(repo, name, "media", version, f"{archive}.zip"),
audeer.path(repo, name, "meta", version, "files.parquet"),
]
print(f"{expected_paths=}")
print(f"{audeer.list_file_names(repo, recursive=True)=}")
assert audeer.list_file_names(repo, recursive=True) == expected_paths

assert deps.tables == ["db.files.parquet"]
file = "data/file1.wav"
assert deps.media == [file]
assert deps.bit_depth(file) == 16
assert deps.channels(file) == signal.shape[0]
assert deps.duration(file) == signal.shape[1] / sampling_rate
assert deps.format(file) == "wav"
assert deps.sampling_rate(file) == sampling_rate

print(
f"{audeer.list_file_names(audeer.path(repository.host, repository.name), recursive=True)=}"
)
db = audb.load(name, version=version, verbose=False, full_path=False)
assert db.files == [file]
assert list(db) == ["files"]
assert os.path.exists(audeer.path(db.root, file))
assert os.path.exists(audeer.path(db.root, "db.files.parquet"))

# Publish table update
db["files"]["object"] = audformat.Column()
db["files"]["object"].set(["!!!"])
db.save(build_dir, storage_format="parquet")
version = "1.1.0"
deps = audb.publish(build_dir, version, repository, previous_version="1.0.0")

assert deps.tables == ["db.files.parquet"]
db = audb.load(name, version=version, verbose=False, full_path=False)
assert db.files == [file]
assert "object" in db["files"].df.columns


def test_publish_text_media_files(tmpdir, dbs, repository):
r"""Test publishing databases containing text files as media files."""
# Create a database, containing text media file
