From fa95ed216b706757e3fc1357a03f74fb74cfa6fa Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Thu, 20 Jun 2024 16:33:24 +0200 Subject: [PATCH] Debug and fixes --- audb/core/dependencies.py | 2 +- audb/core/publish.py | 9 +++-- tests/test_dependencies.py | 54 ++++++++++++++++++++---------- tests/test_load.py | 25 ++++++++------ tests/test_publish.py | 67 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 126 insertions(+), 31 deletions(-) diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py index b8ec2204..7954bafd 100644 --- a/audb/core/dependencies.py +++ b/audb/core/dependencies.py @@ -205,7 +205,7 @@ def table_ids(self) -> typing.List[str]: list of table IDs """ - return [table[3:-4] for table in self.tables] + return [os.path.splitext(table[3:])[0] for table in self.tables] @property def tables(self) -> typing.List[str]: diff --git a/audb/core/publish.py b/audb/core/publish.py index 5cc0a68c..fa9bdedd 100644 --- a/audb/core/publish.py +++ b/audb/core/publish.py @@ -259,10 +259,13 @@ def _find_tables( ) -> typing.List[str]: r"""Find altered, new or removed tables and update 'deps'.""" table_ids = list(db) + # PARQUET is default table, + # CSV file is ignored + # if it exists as well table_files = [ - f"db.{table}.csv" - if os.path.exists(os.path.join(db_root, f"db.{table}.csv")) - else f"db.{table}.parquet" + f"db.{table}.parquet" + if os.path.exists(os.path.join(db_root, f"db.{table}.parquet")) + else f"db.{table}.csv" for table in table_ids ] diff --git a/tests/test_dependencies.py b/tests/test_dependencies.py index 2f24d016..c3ab1662 100644 --- a/tests/test_dependencies.py +++ b/tests/test_dependencies.py @@ -24,6 +24,19 @@ "type": 0, "version": "1.0.0", }, + { + "file": "db.speaker.parquet", + "archive": "", + "bit_depth": 0, + "channels": 0, + "checksum": "3hf774jkf7hfjjg775678djjd5e7dfh3", + "duration": 0.0, + "format": "parquet", + "removed": 0, + "sampling_rate": 0, + "type": 0, + "version": "1.0.0", + }, { "file": "file.wav", "archive": "archive2", @@ -45,7 +58,7 @@ def get_entries(column): def test_get_entries(): - assert get_entries("archive") == ["archive1", "archive2"] + assert get_entries("archive") == ["archive1", "", "archive2"] @pytest.fixture( @@ -121,6 +134,7 @@ def test_call(deps): def test_contains(deps): assert "db.files.csv" in deps assert "file.wav" in deps + assert "db.speaker.parquet" in deps assert "not.csv" not in deps @@ -142,13 +156,16 @@ def test_equals(deps): def test_get_item(deps): assert deps["db.files.csv"] == list(ROWS[0].values())[1:] - assert deps["file.wav"] == list(ROWS[1].values())[1:] + print(deps["db.files.csv"]) + print(deps().archive) + assert deps["db.speaker.parquet"] == list(ROWS[1].values())[1:] + assert deps["file.wav"] == list(ROWS[2].values())[1:] with pytest.raises(KeyError, match="non.existing"): deps["non.existing"] def test_archives(deps): - assert deps.archives == get_entries("archive") + assert deps.archives == sorted(get_entries("archive")) def test_files(deps): @@ -164,11 +181,11 @@ def test_removed_media(deps): def test_table_ids(deps): - assert deps.table_ids == ["files"] + assert deps.table_ids == ["files", "speaker"] def test_tables(deps): - assert deps.tables == ["db.files.csv"] + assert deps.tables == ["db.files.csv", "db.speaker.parquet"] def test_archive(deps): @@ -428,10 +445,12 @@ def test_str(deps): # as the representation might vary, # see https://github.com/audeering/audb/issues/422 expected_str = re.compile( - " archive bit_depth channels .+? type version\n" - "db.files.csv archive1 0 0 .+? 0 1.0.0\n" - "file.wav archive2 16 2 .+? 1 1.0.0.*?" + " archive bit_depth channels .+? type version\n" + "db.files.csv archive1 0 0 .+? 0 1.0.0\n" + "db.speaker.parquet 0 0 .+? 0 1.0.0\n" + "file.wav archive2 16 2 .+? 1 1.0.0.*?" ) + print(str(deps)) assert expected_str.match(str(deps)) assert expected_str.match(deps._df.to_string()) @@ -445,7 +464,7 @@ def test_str(deps): ) def test_add_attachment(deps, file, version, archive, checksum): deps._add_attachment(file, version, archive, checksum) - assert len(deps) == 3 + assert len(deps) == len(ROWS) + 1 assert deps.version(file) == version assert deps.archive(file) == archive assert deps.checksum(file) == checksum @@ -486,7 +505,7 @@ def test_add_attachment(deps, file, version, archive, checksum): ) def test_add_media(deps, values): deps._add_media(values) - assert len(deps) == 4 + assert len(deps) == len(ROWS) + 2 # as we added already attachment before for ( file, archive, @@ -520,7 +539,7 @@ def test_add_media(deps, values): ) def test_add_meta(deps, file, version, archive, checksum): deps._add_meta(file, version, archive, checksum) - assert len(deps) == 3 + assert len(deps) == len(ROWS) + 1 assert deps.version(file) == version assert deps.archive(file) == archive assert deps.checksum(file) == checksum @@ -529,9 +548,10 @@ def test_add_meta(deps, file, version, archive, checksum): @pytest.mark.parametrize( "files, expected_length", [ - (["file.wav"], 1), - (["db.files.csv"], 1), - (["file.wav", "db.files.csv"], 0), + (["file.wav"], 2), + (["db.files.csv"], 2), + (["file.wav", "db.files.csv"], 1), + (["file.wav", "db.files.csv", "db.speaker.parquet"], 0), ], ) def test_drop(deps, files, expected_length): @@ -551,7 +571,7 @@ def test_drop(deps, files, expected_length): def test_remove(deps, file): assert not deps.removed(file) deps._remove(file) - assert len(deps) == 2 + assert len(deps) == len(ROWS) assert deps.removed(file) @@ -595,7 +615,7 @@ def test_remove(deps, file): ) def test_update_media(deps, values): deps._update_media(values) - assert len(deps) == 2 + assert len(deps) == len(ROWS) for ( file, archive, @@ -635,6 +655,6 @@ def test_update_media(deps, values): ) def test_update_media_version(deps, files, version): deps._update_media_version(files, version) - assert len(deps) == 2 + assert len(deps) == len(ROWS) for file in files: assert deps.version(file) == version diff --git a/tests/test_load.py b/tests/test_load.py index f126c4ce..ece6f9db 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -33,11 +33,16 @@ def assert_database_tmp_folder_is_deleted(): assert len([d for d in dirs if d.endswith("~")]) == 0 +@pytest.fixture(scope="module", autouse=False) +def storage_format(): + yield "csv" + + @pytest.fixture( scope="module", autouse=True, ) -def dbs(tmpdir_factory, persistent_repository): +def dbs(tmpdir_factory, persistent_repository, storage_format): r"""Publish different versions of the same database. Returns: @@ -96,7 +101,7 @@ def dbs(tmpdir_factory, persistent_repository): audeer.touch(db_root, "extra/folder/file1.txt") audeer.touch(db_root, "extra/folder/file2.txt") audeer.touch(db_root, "extra/folder/sub-folder/file3.txt") - db.save(db_root) + db.save(db_root, storage_format=storage_format) audformat.testing.create_audio_files(db) archives = db["files"]["speaker"].get().dropna().to_dict() audb.publish( @@ -126,7 +131,7 @@ def dbs(tmpdir_factory, persistent_repository): ) os.remove(audeer.path(db_root, "extra/folder/file2.txt")) - db.save(db_root) + db.save(db_root, storage_format=storage_format) audformat.testing.create_audio_files(db) shutil.copy( audeer.path(previous_db_root, audb.core.define.DEPENDENCIES_FILE), @@ -153,7 +158,7 @@ def dbs(tmpdir_factory, persistent_repository): audeer.path(db_root, "extra"), ) - db.save(db_root) + db.save(db_root, storage_format=storage_format) audformat.testing.create_audio_files(db) shutil.copy( audeer.path(previous_db_root, audb.core.define.DEPENDENCIES_FILE), @@ -180,7 +185,7 @@ def dbs(tmpdir_factory, persistent_repository): del db.attachments["file"] os.remove(audeer.path(db_root, "extra/file.txt")) - db.save(db_root) + db.save(db_root, storage_format=storage_format) audformat.testing.create_audio_files(db) file = os.path.join(db_root, db.files[0]) y, sr = audiofile.read(file) @@ -189,7 +194,7 @@ def dbs(tmpdir_factory, persistent_repository): file = db.files[-1] db.pick_files(lambda x: x != file) os.remove(audeer.path(db_root, file)) - db.save(db_root) + db.save(db_root, storage_format=storage_format) shutil.copy( os.path.join(previous_db_root, audb.core.define.DEPENDENCIES_FILE), @@ -217,7 +222,7 @@ def dbs(tmpdir_factory, persistent_repository): fp.write("text") db.drop_tables("train") - db.save(db_root) + db.save(db_root, storage_format=storage_format) audformat.testing.create_audio_files(db) shutil.copy( os.path.join(previous_db_root, audb.core.define.DEPENDENCIES_FILE), @@ -329,7 +334,7 @@ def test_load(dbs, format, version, only_metadata): else: assert os.path.exists(os.path.join(db_root, file)) - # Assert tables are identical and exist as CSV files + # Assert tables are identical and table files exist for table in db: assert os.path.exists(os.path.join(db_root, f"db.{table}.csv")) pd.testing.assert_frame_equal( @@ -381,7 +386,7 @@ def test_load(dbs, format, version, only_metadata): else: assert os.path.exists(file) - # Assert table CSV files exist + # Assert table files exist for table in db: assert os.path.exists(os.path.join(db_root, f"db.{table}.csv")) @@ -686,7 +691,7 @@ def test_load_to(tmpdir, dbs, version, only_metadata): else: assert os.path.exists(os.path.join(db_root, file)) - # Assert tables are identical and exist as CSV files + # Assert tables are identical and exist as files for table in db: assert os.path.exists(os.path.join(db_root, f"db.{table}.csv")) pd.testing.assert_frame_equal( diff --git a/tests/test_publish.py b/tests/test_publish.py index edfa51a0..230798e4 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -1064,6 +1064,73 @@ def test_publish_error_version(tmpdir, repository): audb.publish(db_path, "2.0.0", repository, previous_version="1.0.0?") +def test_publish_parquet_tables(tmpdir, repository): + r"""Test publishing and loading of parquet tables.""" + build_dir = audeer.path(tmpdir, "./build") + audeer.mkdir(build_dir) + data_dir = audeer.mkdir(build_dir, "data") + audio_file = audeer.path(data_dir, "file1.wav") + signal = np.zeros((2, 1000)) + sampling_rate = 8000 + audiofile.write(audio_file, signal, sampling_rate) + name = "test-db" + db = audformat.Database(name) + db.schemes["speaker"] = audformat.Scheme("str") + index = audformat.filewise_index(["data/file1.wav"]) + db["files"] = audformat.Table(index) + db["files"]["speaker"] = audformat.Column(scheme_id="speaker") + db["files"]["speaker"].set(["adam"]) + db.save(build_dir, storage_format="parquet") + print(f"{audeer.list_file_names(build_dir, basenames=True)=}") + + # Publish database + version = "1.0.0" + deps = audb.publish(build_dir, version, repository) + + # Check files are published to repository + repo = audeer.path(repository.host, repository.name) + archive = deps.archive("data/file1.wav") + expected_paths = [ + audeer.path(repo, name, version, "db.parquet"), + audeer.path(repo, name, version, "db.yaml"), + audeer.path(repo, name, "media", version, f"{archive}.zip"), + audeer.path(repo, name, "meta", version, "files.parquet"), + ] + print(f"{expected_paths=}") + print(f"{audeer.list_file_names(repo, recursive=True)=}") + assert audeer.list_file_names(repo, recursive=True) == expected_paths + + assert deps.tables == ["db.files.parquet"] + file = "data/file1.wav" + assert deps.media == [file] + assert deps.bit_depth(file) == 16 + assert deps.channels(file) == signal.shape[0] + assert deps.duration(file) == signal.shape[1] / sampling_rate + assert deps.format(file) == "wav" + assert deps.sampling_rate(file) == sampling_rate + + print( + f"{audeer.list_file_names(audeer.path(repository.host, repository.name), recursive=True)=}" + ) + db = audb.load(name, version=version, verbose=False, full_path=False) + assert db.files == [file] + assert list(db) == ["files"] + assert os.path.exists(audeer.path(db.root, file)) + assert os.path.exists(audeer.path(db.root, "db.files.parquet")) + + # Publish table update + db["files"]["object"] = audformat.Column() + db["files"]["object"].set(["!!!"]) + db.save(build_dir, storage_format="parquet") + version = "1.1.0" + deps = audb.publish(build_dir, version, repository, previous_version="1.0.0") + + assert deps.tables == ["db.files.parquet"] + db = audb.load(name, version=version, verbose=False, full_path=False) + assert db.files == [file] + assert "object" in db["files"].df.columns + + def test_publish_text_media_files(tmpdir, dbs, repository): r"""Test publishing databases containing text files as media files.""" # Create a database, containing text media file