This repository has been archived by the owner on Jul 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadditional_indexes.py
65 lines (52 loc) · 1.63 KB
/
additional_indexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
"""This module implements additional indexes, such as index over song names.
"""
import argparse
import os
import shelve
from collections import defaultdict
from typing import List

from gensim.parsing.porter import PorterStemmer
def pretty_doc(filename: str) -> str:
    """Convert filename to pretty string 'band - song'.

    The band is the second-to-last "/"-separated path component and the song
    is the last component with its extension (the text after the final dot)
    removed. Dots inside the song title are preserved, so
    "Metallica/St. Anger.txt" becomes "Metallica - St. Anger". A name whose
    only dot is its first character is kept unchanged.

    Args:
        filename: Path to a file, with at least two "/"-separated components.

    Returns:
        Pretty formatted song name.

    Raises:
        ValueError: If ``filename`` contains no "/" separator.
    """
    band, name = filename.split("/")[-2:]
    # Strip only the trailing extension; cutting at the first dot would
    # truncate titles that contain dots themselves.
    stem, dot, _extension = name.rpartition(".")
    if dot and stem:
        name = stem
    return "{} - {}".format(band, name)
def build_name_index(
    docs: List[str], stemmer: PorterStemmer
) -> None:
    """Build an inverted index over song names and persist it with shelve.

    Every filename is rendered with ``pretty_doc``, split on whitespace, and
    each token is passed through ``stemmer.stem``. The index maps a stemmed
    term to a dict ``{doc_id: 1}``, where ``doc_id`` is the position of the
    document in ``docs``. The result is written to the shelf
    ``"index_names"``, replacing any previous content of that shelf.

    Args:
        docs: List of filenames.
        stemmer: Gensim porter stemmer.
    """
    index_names = defaultdict(dict)
    for doc_id, doc in enumerate(docs):
        # NOTE: the "-" separator that pretty_doc puts between band and song
        # is a whitespace-delimited token too, so it is indexed like a word.
        for token in pretty_doc(doc).split():
            index_names[stemmer.stem(token)][doc_id] = 1

    # flag="n" always starts from an empty shelf. With the default flag,
    # update() would leave terms from an earlier build that no longer occur
    # in ``docs`` in place, still pointing at outdated document ids.
    with shelve.open("index_names", flag="n") as index:
        index.update(index_names)
def arg_parse() -> argparse.Namespace:
    """Read the command line options of the additional-index builder.

    Returns:
        Namespace whose ``root`` attribute holds the lyrics root directory
        (``"lyrics/"`` unless ``--root`` is given on the command line).
    """
    cli = argparse.ArgumentParser(description="Additional indexes")
    root_option = {
        "dest": "root",
        "type": str,
        "default": "lyrics/",
        "help": "Lyrics root directory",
    }
    cli.add_argument("--root", **root_option)
    namespace = cli.parse_args()
    return namespace
if __name__ == "__main__":
    # Script entry point: collect every "<band>/<file>" pair found below the
    # lyrics root and build the song-name index from them.
    args = arg_parse()
    docs = [
        # Keep the "band/file" form with a literal "/": pretty_doc splits
        # on "/" to recover the band and the song name.
        band + "/" + song
        for band in os.listdir(args.root)
        # os.path.join works whether or not --root ends with a separator;
        # plain files lying next to the band directories are skipped
        # instead of making os.listdir fail.
        if os.path.isdir(os.path.join(args.root, band))
        for song in os.listdir(os.path.join(args.root, band))
    ]
    stemmer = PorterStemmer()
    build_name_index(docs, stemmer)