-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbibtex_duplicate_manager.py
182 lines (149 loc) · 7.92 KB
/
bibtex_duplicate_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import bibtexparser
from bibtexparser.bparser import BibTexParser
import re
from collections import defaultdict
import string
import numpy as np
def load_bibtex_files(filenames):
    """Parse every BibTeX file in ``filenames`` and return all entries as one list.

    Each file is opened and handed to ``bibtexparser.load`` together with a
    fresh ``BibTexParser(common_strings=True)``. The resulting entries are
    concatenated in the order the files were given.
    """
    collected = []
    for path in filenames:
        with open(path) as handle:
            database = bibtexparser.load(handle, parser=BibTexParser(common_strings=True))
            collected += database.entries
    return collected
def is_subsequence(words_a, words_b):
    """Return True if the items of words_a occur in words_b in the same order (gaps allowed)."""
    remaining = iter(words_b)
    for wanted in words_a:
        # A membership test on the iterator consumes it up to and including the
        # first match, so the next word can only be found further along.
        if wanted not in remaining:
            return False
    return True
def remove_double_braces(s):
    """Strip ``{{...}}`` wrappers from ``s`` while keeping the text inside them.

    BibTeX fields sometimes wrap words in double braces (e.g. ``{{WORD}}``),
    which gets in the way when comparing entries. Only spans whose inner text
    contains no ``}`` are unwrapped.
    """
    double_braced = re.compile(r'\{\{([^}]*)\}\}')
    return double_braced.sub(lambda found: found.group(1), s)
def find_potential_duplicates(entries, num_tol = 7):
    """Group entries whose titles look like duplicates of each other.

    Titles are read from the ``title`` field (falling back to ``Title``, then
    to an empty string), stripped of ``{{...}}`` wrappers and punctuation,
    lower-cased and split into words. A later entry ``j`` joins the group of
    an earlier entry ``i`` when some run of ``num_tol`` consecutive words of
    title ``i`` appears in title ``j`` in the same order (not necessarily
    adjacent).

    Args:
        entries: list of BibTeX entry dicts.
        num_tol: number of consecutive words of the first title that must be
            found in the second one. A title with fewer than ``num_tol``
            words yields no such run, so it never pulls other entries into
            its group.

    Returns:
        A dict mapping the raw title of each group's first entry to the list
        of entries in that group. Groups with a single entry are dropped. An
        entry may appear in more than one group.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Normalise every title once up front instead of once per compared pair.
    raw_titles = [entry.get('title', entry.get("Title", '')) for entry in entries]
    word_lists = [
        remove_double_braces(raw).translate(strip_punctuation).lower().split()
        for raw in raw_titles
    ]

    potential_duplicates = defaultdict(list)
    for i, (title_i_str, title_i) in enumerate(zip(raw_titles, word_lists)):
        # An identical raw title already leads a group; do not start a second one.
        if title_i_str in potential_duplicates:
            continue
        potential_duplicates[title_i_str].append(entries[i])

        # Every run of num_tol consecutive words of title i.
        windows = [title_i[k:k + num_tol] for k in range(len(title_i) - num_tol + 1)]
        if not windows:
            continue
        for j in range(i + 1, len(entries)):
            title_j = word_lists[j]
            if any(is_subsequence(window, title_j) for window in windows):
                potential_duplicates[title_i_str].append(entries[j])

    # Only groups with more than one entry are potential duplicates.
    return {title: group for title, group in potential_duplicates.items() if len(group) > 1}
def filter_duplicates(duplicates, filenames):
    """Keep only the duplicate groups whose citekeys actually need rewriting.

    A group survives when ``check_citekeys_occurence`` reports that more than
    one of its citekeys is cited in ``filenames``; otherwise there is nothing
    to replace. Entries of surviving groups are annotated in place with an
    ``"occurences"`` count taken from ``find_citekey_in_files``.
    """
    kept = {}
    for group_title, group in duplicates.items():
        if not check_citekeys_occurence(filenames, [member["ID"] for member in group]):
            continue
        kept[group_title] = group.copy()
        for member in group:
            member["occurences"] = find_citekey_in_files(filenames, member["ID"], False)[1]
    return kept
def replace_keys_in_tex_files(file_list, old_citekeys, new_citekey):
    r"""Rewrite ``\cite{...}`` commands so that old citekeys become ``new_citekey``.

    Every ``\cite{...}`` in every file is split on commas, and each listed key
    that equals an old citekey (ignoring surrounding whitespace) is replaced.
    Keys are compared as whole list entries, so ``smith`` is not touched
    inside ``smith:2020``. Whitespace around the replaced key is preserved.

    Args:
        file_list: paths of the files to rewrite in place.
        old_citekeys: citekeys to be replaced.
        new_citekey: citekey written in their place; it is inserted literally.

    A file is only written back when its content changed. One status line is
    printed per (old citekey, file) pair.
    """
    cite_command = re.compile(r'(\\cite\{)([^}]*)(\})')

    def swap_in_cite(match, old_citekey):
        # Rebuild the comma-separated key list of one \cite{...} command.
        listed = []
        for listed_key in match.group(2).split(','):
            bare = listed_key.strip()
            if bare == old_citekey:
                lead = len(listed_key) - len(listed_key.lstrip())
                listed_key = listed_key[:lead] + new_citekey + listed_key[lead + len(bare):]
            listed.append(listed_key)
        return match.group(1) + ','.join(listed) + match.group(3)

    for old_citekey in old_citekeys:
        for filename in file_list:
            with open(filename, 'r') as file:
                content = file.read()
            # A callable replacement keeps new_citekey literal: a template such as
            # r'\1' + '2020key' would be parsed as a reference to group 12.
            new_content = cite_command.sub(
                lambda match: swap_in_cite(match, old_citekey), content
            )
            if new_content != content:
                with open(filename, 'w') as file:
                    file.write(new_content)
            print(f"All occurrences of '{old_citekey}' in {filename} have been replaced with '{new_citekey}'.")
def check_citekeys_occurence(file_list, citekeys):
    """Tell whether more than one distinct citekey from ``citekeys`` is cited in ``file_list``.

    Duplicate keys are collapsed first; each remaining key is looked up with
    ``find_citekey_in_files`` and the number of keys that were found is
    compared against one.
    """
    hits = [find_citekey_in_files(file_list, key)[0] for key in set(citekeys)]
    return np.sum(hits) > 1
def find_citekey_in_files(file_list, citekey, verbose=False):
    r"""Report whether, and how often, ``citekey`` is cited in the given files.

    A citation is a ``\cite{key1, key2, ...}`` command whose comma-separated
    list contains ``citekey`` as a whole entry (surrounding whitespace is
    ignored), so ``smith`` is not counted inside ``\cite{smith:2020}``.

    Args:
        file_list: paths of the files to search.
        citekey: the key to look for.
        verbose: when true, print the matching commands found in each file.

    Returns:
        ``(found, num_occured)``: whether any file cites the key, and the
        number of matching ``\cite`` commands summed over all files.
    """
    pattern = re.compile(
        r'\\cite\{(?:[^}]*,)?\s*' + re.escape(citekey) + r'\s*(?:,[^}]*)?\}'
    )
    num_occured = 0
    for filename in file_list:
        with open(filename, 'r') as file:
            content = file.read()
        matches = pattern.findall(content)
        if not matches:
            continue
        # Accumulate across files rather than keeping only the last file's count.
        num_occured += len(matches)
        if verbose:
            print(f"The citekey '{citekey}' was found in {filename}:")
            for match in matches:
                print(f" {match}")
            print(f"Occurences: {len(matches)}")
    return num_occured > 0, num_occured
def check_arXiv(entries, filelist):
    """Return the entries that are cited in ``filelist`` and whose journal mentions arXiv.

    An entry qualifies when ``find_citekey_in_files`` finds its ``ID`` and its
    ``journal`` field (empty string when absent) contains "arxiv",
    case-insensitively. Input order is preserved.
    """
    return [
        entry
        for entry in entries
        if find_citekey_in_files(filelist, entry["ID"])[0]
        and "arxiv" in entry.get("journal", "").lower()
    ]
def _split_paths(raw):
    """Split a comma-separated list of paths, trimming blanks and dropping empty items."""
    return [part.strip() for part in raw.split(',') if part.strip()]


def _ask_index_to_keep(num_choices):
    """Prompt until the user enters a number from 1 to num_choices; return it zero-based."""
    while True:
        answer = input("Enter the number of the key to keep: ")
        try:
            keep = int(answer) - 1
        except ValueError:
            # Non-numeric input is treated like an out-of-range number.
            print("Invalid number!")
            continue
        if 0 <= keep < num_choices:
            return keep
        print("Invalid number!")


def main():
    """Interactively merge duplicate BibTeX citekeys used in a set of .tex files.

    The user supplies the .bib and .tex paths. Entries that are cited in the
    .tex files are grouped by title similarity, and each group of up to five
    entries is shown for confirmation. Rejected groups, and groups with more
    than five entries, are re-compared with a stricter ``num_tol`` and the
    resulting sub-groups are processed next. For a confirmed group the user
    picks the key to keep and every other key of the group is replaced by it
    in the .tex files.
    """
    print("Welcome to the BibTeX duplicate cleaner!")
    bib_files = _split_paths(input("Enter the paths to the .bib files, separated by commas: "))
    tex_files = _split_paths(input("Enter the paths to the .tex files, separated by commas: "))
    entries = load_bibtex_files(bib_files)
    # Entries that are never cited cannot need a citekey replacement.
    nonzero_entries = [entry for entry in entries if find_citekey_in_files(tex_files, entry["ID"])[0]]
    duplicates = find_potential_duplicates(nonzero_entries, 5)
    duplicates_filtered = filter_duplicates(duplicates, tex_files)

    for title, duplicate_entries in duplicates_filtered.items():
        print("new entry")
        items = [(title, duplicate_entries)]
        num_tol = 5
        while items:
            title2, duplicate_entries2 = items.pop(0)
            confirmed = False
            if len(duplicate_entries2) < 6:
                print(f"\nPotential duplicates for '{title2}':")
                for i, entry in enumerate(duplicate_entries2):
                    print(f"{i+1}: {entry.get('title', 'No title')} (key: {entry.get('ID', 'No ID')}) (occur: {entry.get('occurences', 0)})")
                confirm = input("Treat these as duplicates? (yes/no) ")
                confirmed = confirm.strip().lower() == 'yes'
            if not confirmed:
                # The group mixes non-duplicates (or is too large to show):
                # compare again with a larger num_tol until the leading group
                # shrinks or nothing is left.
                current_length = len(duplicate_entries2)
                while True:
                    num_tol += 1
                    duplicates2 = find_potential_duplicates(duplicate_entries2, num_tol=num_tol)
                    duplicates_filtered2 = filter_duplicates(duplicates2, tex_files)
                    if len(duplicates_filtered2) == 0 or current_length > len(next(iter(duplicates_filtered2.values()))):
                        break
                # Process the refined sub-groups before anything still queued.
                items = list(duplicates_filtered2.items()) + items
                continue
            num_tol = 5
            # Validate against the group currently shown, not the outer group.
            keep = _ask_index_to_keep(len(duplicate_entries2))
            replace_keys_in_tex_files(
                tex_files,
                [entry.get('ID') for i, entry in enumerate(duplicate_entries2) if i != keep],
                duplicate_entries2[keep].get('ID'),
            )
    print("Done cleaning duplicates!")


if __name__ == "__main__":
    main()