-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrun_extraction_example.py
55 lines (42 loc) · 2.01 KB
/
run_extraction_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
from musif.config import GenericConfiguration
from musif.extract.features.core.constants import FILE_NAME
import os
from pathlib import Path
import pandas as pd
from musif.extract.extract import FeaturesExtractor
from musif.process.processor import DataProcessor
# MAIN FILE to run extractions of data by Didone Project.
# directory containing xml files
data_dir = Path("xml_example") / "example.xml"
DEST_PATH = "destination_path"
# directory containing .pkl files in case of previous extractions for cache
cache_dir = None
# csv file containing files which raised error and need to be reextracted
try:
path_error = f'{DEST_PATH}/error_files.csv'
errored_files = list(pd.read_csv(path_error, low_memory=False)[FILE_NAME])
except Exception:
# Handle the case where there is no file is empty
print("There is no error_files.csv, it will be created and loaded error files are included manually in it.")
pass
# In case a partial extraction has been run, set here the previous df to avoid re-extracting these files.
# prev_path = str(prefix / NAME) + '.csv'
# exclude_files = list(pd.read_csv('martiser/extractions/total_8_12.csv', low_memory=False)['FileName'])
# In case only some files need to be extracted.
# xml_files = [filename for filename in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, filename)) and filename.endswith('.xml')]
# limit_files = xml_files[0:len(xml_files)//4]
extracted_df = FeaturesExtractor(
GenericConfiguration("config_extraction_example.yml"),
data_dir = str(data_dir),
# musescore_dir = Path("data") / "musescore", #only for harmonic analysis
# exclude_files = exclude_files,
# limit_files = limit_files,
cache_dir=cache_dir,
).extract()
extracted_df.to_csv(str(DEST_PATH)+'.csv', index=False)
# The raw df will be now saved in the DEST_PATH, and now post-processed
# by the Didone Processor, and ALSO saved in 4 separated csv files.
p = DataProcessor(str(DEST_PATH) + '.csv', "config_postprocess_example.yml")
p.process()
p.save(str(DEST_PATH))