Merge pull request #1 from RahulVadisetty91/RahulVadisetty91-patch-1
Enhanced Text Processing with AI Integration
Showing 1 changed file with 278 additions and 0 deletions.
import os
import re
import regex
import sys
from transformers import pipeline  # Hugging Face pipeline factory for the AI model

# Initialize the NLP pipeline for text generation or analysis.
# The Hugging Face Hub id for GPT-2 is 'gpt2'.
nlp_pipeline = pipeline('text-generation', model='gpt2')
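
# A minimal sanity-check sketch of the pipeline's return shape (assumes the
# 'gpt2' weights are cached locally or can be downloaded): each call returns
# a list of dicts that hold the text under the 'generated_text' key, which is
# what _ai_text_analysis below indexes into.
#
#   sample = nlp_pipeline('Hello world', max_length=10, num_return_sequences=1)
#   print(sample[0]['generated_text'])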
def _unnumber_chaps_and_secs(lines):
    # Preface, Installation, and Notation are unnumbered chapters
    NUM_UNNUMBERED_CHAPS = 3
    # Preliminaries
    TOC2_START_CHAP_NO = 5

    num_chaps = 0
    for i, l in enumerate(lines):
        if l.startswith('\\chapter{'):
            num_chaps += 1
            # Unnumber unnumbered chapters
            if num_chaps <= NUM_UNNUMBERED_CHAPS:
                chap_name = re.split('{|}', l)[1]
                lines[i] = ('\\chapter*{' + chap_name
                            + '}\\addcontentsline{toc}{chapter}{'
                            + chap_name + '}\n')
            # Set tocdepth to 2 after Chap 1
            elif num_chaps == TOC2_START_CHAP_NO:
                lines[i] = ('\\addtocontents{toc}{\\protect\\setcounter{tocdepth}{2}}\n'
                            + lines[i])
        # Unnumber all sections in unnumbered chapters
        elif 1 <= num_chaps <= NUM_UNNUMBERED_CHAPS:
            if (l.startswith('\\section') or l.startswith('\\subsection')
                    or l.startswith('\\subsubsection')):
                lines[i] = l.replace('section{', 'section*{')

    # Since we inserted '\n' in some lines[i], re-build the list in place
    # so the caller's list is updated too.
    lines[:] = '\n'.join(lines).split('\n')
def _sec_to_chap(lines):
    for i, l in enumerate(lines):
        # Recursive pattern ((?R)) grabs whole balanced-brace groups
        longest_balanced_braces = regex.findall(r'\{(?>[^{}]|(?R))*\}', l)
        for src in longest_balanced_braces:
            if src.startswith('{Section \\ref') and 'index:' in src:
                tgt = src.replace('Section \\ref', 'Chapter \\ref')
                lines[i] = lines[i].replace(src, tgt)
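
# Note: the pattern above relies on recursion ((?R)) and an atomic group,
# features of the third-party `regex` module rather than the stdlib `re`.
# For example, on the line '{Section \\ref{sec_a}, index:1}' it matches the
# entire outer group instead of stopping at the first inner '}'.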
def _pagenumbering(lines):
    BEGINDOC = '\\begin{document}'
    FRONTNUMS = ['\\pagenumbering{roman}',
                 '\\pagestyle{empty}',
                 '\\halftitle',
                 '\\cleardoublepage']
    INTRONUMS = ['\\mainmatter', '\\pagenumbering{arabic}', '\\setcounter{page}{1}']
    CHAPINTRO = '\\chapter{Introduction}'
    frontnums_i = -1
    chapintro_i = -1
    for i, l in enumerate(lines):
        if l.startswith(BEGINDOC):
            frontnums_i = i + 1
        elif l.startswith(CHAPINTRO):
            chapintro_i = i
            break
    # Guard: only insert if \begin{document} was found
    if frontnums_i >= 0:
        for i, v in enumerate(FRONTNUMS):
            lines.insert(frontnums_i + i, v)
    for i, v in enumerate(INTRONUMS):
        if chapintro_i > 0:
            lines.insert(chapintro_i + len(FRONTNUMS) + i, v)
def _replace_chars_in_chapter_title_and_caption(lines):
    CAP_CHAP = {'\\chapter{', '\\section{', '\\caption{'}

    def _get_replaced(s):
        # Replace typographic quotes and dashes with their LaTeX equivalents
        BEFORES = ['’', '“', '”', '–']
        AFTERS = ['\'', '``', '\'\'', '--']
        for before, after in zip(BEFORES, AFTERS):
            s = s.replace(before, after)
        return s

    i = 0
    while i < len(lines):
        if any(lines[i].startswith(cap_chap) for cap_chap in CAP_CHAP):
            num_lefts = 0
            found_end = False
            # Titles and captions may span multiple lines; scan until the
            # opening brace is balanced
            while not found_end:
                j_start = 0
                j_end = len(lines[i])
                for j, char in enumerate(lines[i]):
                    if char == '{':
                        num_lefts += 1
                        if num_lefts == 1:
                            j_start = j + 1
                    elif char == '}':
                        num_lefts -= 1
                        if num_lefts == 0:
                            j_end = j
                            found_end = True
                            break
                lines[i] = (lines[i][:j_start]
                            + _get_replaced(lines[i][j_start:j_end])
                            + lines[i][j_end:])
                if not found_end:
                    i += 1
        i += 1

    for i, l in enumerate(lines):
        if l.startswith('\\chapter{') or l.startswith('\\section{'):
            lines[i] = lines[i].replace('--', '\\(-\\)')
def _edit_titlepage(pdf_dir):
    smanual = os.path.join(pdf_dir, 'sphinxmanual.cls')
    with open(smanual, 'r') as f:
        lines = f.read().split('\n')

    for i, l in enumerate(lines):
        lines[i] = l.replace('\\@date', '')

    with open(smanual, 'w') as f:
        f.write('\n'.join(lines))
def delete_lines(lines, deletes):
    return [line for i, line in enumerate(lines) if i not in deletes]
def _delete_discussions_title(lines):
    deletes = []
    to_delete = False
    for i, l in enumerate(lines):
        if 'section*{Discussion' in l or 'section{Discussion' in l:
            to_delete = True
        elif to_delete and '\\sphinxincludegraphics' in l:
            to_delete = False
        if to_delete:
            deletes.append(i)
    return delete_lines(lines, deletes)
def _protect_hyperlink_in_caption(lines):
    def _get_num_extra_left_braces(l, num_extra_left_braces):
        num = num_extra_left_braces
        for char in l:
            if char == '{':
                num += 1
            elif char == '}':
                num -= 1
                if num == 0:
                    return 0
        return num

    i = 0
    while i < len(lines):
        if lines[i].startswith('\\caption{') or lines[i].startswith('\\sphinxcaption{'):
            num_extra_left_braces = _get_num_extra_left_braces(lines[i], 0)
            if num_extra_left_braces == 0:
                j = i
            else:
                j = i + 1
                while j < len(lines):
                    num_extra_left_braces = _get_num_extra_left_braces(
                        lines[j], num_extra_left_braces)
                    if num_extra_left_braces == 0:
                        break
                    j += 1
            for index in range(i, j + 1):
                lines[index] = lines[index].replace('\\hyperlink',
                                                    '\\protect\\hyperlink')
            i = j + 1
        else:
            i += 1
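
# Background: a caption is a "moving argument" in LaTeX, so fragile commands
# such as \hyperlink inside it must be prefixed with \protect or they can
# break when the caption is written out to the list-of-figures file; the loop
# above applies that protection across multi-line captions as well.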
def _remove_appendix_numbering_and_rename_bib(lines):
    BEGIN_APPENDIX = '\\chapter{Appendix'
    BEGIN_BIB = '\\begin{sphinxthebibliography'
    END_APPENDIX = ['\\endappendix',
                    '\\renewcommand\\bibname{References}']

    found_begin_appendix = False
    one_appendix = True
    appendix_i = -1
    bib_i = -1
    for i, l in enumerate(lines):
        if l.startswith(BEGIN_APPENDIX):
            lines[i] = lines[i].replace('\\chapter{Appendix: ', '\\chapter{')
            if found_begin_appendix:
                one_appendix = False
            else:
                appendix_i = i
                found_begin_appendix = True
        elif l.startswith(BEGIN_BIB):
            bib_i = i

    # Guards: only insert when a bibliography/appendix was actually found
    if bib_i >= 0:
        for i, v in enumerate(END_APPENDIX):
            lines.insert(bib_i + i, v)
    if appendix_i >= 0:
        if one_appendix:
            lines.insert(appendix_i, '\\oneappendix')
        else:
            lines.insert(appendix_i, '\\appendix')
def _fit_chapter_titles(lines):
    for i, l in enumerate(lines):
        if l.startswith('\\chapter{Mathematics for Deep Learning}'):
            lines[i] = ('\\chapter[Mathematics for Deep Learning]'
                        '{Mathematics for Deep\\\\Learning}')
        if l.startswith('\\chapter{Linear Neural Networks for Classification}'):
            lines[i] = ('\\chapter[Linear Neural Networks for Classification]'
                        '{\\raisebox{-12pt}{Linear Neural Networks for Classification}}')
def _remove_footnote_trailing_space(lines):
    seen_discussion_url = False
    for i, l in enumerate(lines):
        if l.startswith('\\sphinxnolinkurl{'):
            lines[i] += '\\sphinxAtStartFootnote'
        if l.startswith('\\sphinxhref{https://discuss.d2l.ai/t/'):
            seen_discussion_url = True
        if seen_discussion_url and l.startswith('\\end{footnote}'):
            lines[i] += '.'
            seen_discussion_url = False
def _add_extra_line_before_endbib(lines):
    for i, l in enumerate(lines):
        if l.startswith('\\end{sphinxthebibliography}'):
            # Only insert when the bibliography end was actually found
            lines.insert(i, '')
            break
def _remove_index(lines):
    for i, l in enumerate(lines):
        j_start = 0
        while j_start < len(l) - 6:
            if l[j_start:j_start + 7] == '\\index{':
                # Scan to the matching closing brace of \index{...}
                j = j_start + 7
                num_extra_left_braces = 1
                while num_extra_left_braces > 0:
                    if l[j] == '{':
                        num_extra_left_braces += 1
                    elif l[j] == '}':
                        num_extra_left_braces -= 1
                    j += 1
                enclosed_text = l[j_start + 7:j - 1]
                lines[i] = lines[i].replace('\\index{' + enclosed_text + '}', '')
                j_start = j
            else:
                j_start += 1
def _fix_indent_at_chap_start(lines):
    is_chap_start = False
    for i, l in enumerate(lines):
        if l.startswith('\\chapter'):
            is_chap_start = True
        if is_chap_start and l.startswith('\\sphinxAtStartPar'):
            lines[i] = ''
            is_chap_start = False
def _ai_text_analysis(lines):
    """Use AI to analyze and enhance text content."""
    for i, l in enumerate(lines):
        # Skip blank lines; there is nothing for the model to improve
        if not l.strip():
            continue
        # The model's output replaces the original line wholesale, so this
        # pass rewrites the document rather than annotating it
        result = nlp_pipeline(l, max_length=50, num_return_sequences=1)
        lines[i] = result[0]['generated_text']
def main():
    tex_file = sys.argv[1]
    with open(tex_file, 'r') as f:
        lines = f.read().split('\n')

    _unnumber_chaps_and_secs(lines)
    _sec_to_chap(lines)
    # lines = _delete_discussions_title(lines)
    _protect_hyperlink_in_caption(lines)
    _pagenumbering(lines)
    _replace_chars_in_chapter_title_and_caption(lines)
    _remove_appendix_numbering_and_rename_bib(lines)
    _fit_chapter_titles(lines)
    _remove_footnote_trailing_space(lines)
    _add_extra_line_before_endbib(lines)
    _remove_index(lines)
    _fix_indent_at_chap_start(lines)
    _ai_text_analysis(lines)  # Integrate AI text analysis

    with open(tex_file, 'w') as f:
        f.write('\n'.join(lines))

    pdf_dir = os.path.dirname(tex_file)
    # _edit_titlepage(pdf_dir)


if __name__ == "__main__":
    main()
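
# Usage sketch (the script and .tex names below are illustrative, not taken
# from this repo):
#
#   python post_latex.py build/latex/book.tex
#
# The script rewrites the file in place, so keep a backup when experimenting
# with the AI pass: _ai_text_analysis replaces each line with model output.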