Skip to content

Commit

Permalink
Merge pull request #1 from RahulVadisetty91/RahulVadisetty91-patch-1
Browse files Browse the repository at this point in the history
Enhanced Text Processing with AI Integration
  • Loading branch information
RahulVadisetty91 authored Aug 22, 2024
2 parents 23d7a5a + dc410a2 commit 484979f
Showing 1 changed file with 278 additions and 0 deletions.
278 changes: 278 additions & 0 deletions ai_text_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import os
import re
import regex
import sys
from transformers import pipeline # Import the AI model

# Initialize the NLP pipeline for text generation or analysis
nlp_pipeline = pipeline('text-generation', model='gpt-2')

def _unnumber_chaps_and_secs(lines):
# Preface, Installation, and Notation are unnumbered chapters
NUM_UNNUMBERED_CHAPS = 3
# Preliminaries
TOC2_START_CHAP_NO = 5

preface_reached = False
ch2_reached = False
num_chaps = 0
for i, l in enumerate(lines):
if l.startswith('\\chapter{'):
num_chaps += 1
# Unnumber unnumbered chapters
if num_chaps <= NUM_UNNUMBERED_CHAPS:
chap_name = re.split('{|}', l)[1]
lines[i] = ('\\chapter*{' + chap_name
+ '}\\addcontentsline{toc}{chapter}{'
+ chap_name + '}\n')
# Set tocdepth to 2 after Chap 1
elif num_chaps == TOC2_START_CHAP_NO:
lines[i] = ('\\addtocontents{toc}{\\protect\\setcounter{tocdepth}{2}}\n'
+ lines[i])
# Unnumber all sections in unnumbered chapters
elif 1 <= num_chaps <= NUM_UNNUMBERED_CHAPS:
if (l.startswith('\\section') or l.startswith('\\subsection')
or l.startswith('\\subsubsection')):
lines[i] = l.replace('section{', 'section*{')

# Since we inserted '\n' in some lines[i], re-build the list
lines = '\n'.join(lines).split('\n')

def _sec_to_chap(lines):
for i, l in enumerate(lines):
longest_balanced_braces = regex.findall('\{(?>[^{}]|(?R))*\}', l)
for src in longest_balanced_braces:
if src.startswith('{Section \\ref') and 'index:' in src:
tgt = src.replace('Section \\ref', 'Chapter \\ref')
lines[i] = lines[i].replace(src, tgt)

def _pagenumbering(lines):
BEGINDOC = '\\begin{document}'
FRONTNUMS = ['\\pagenumbering{roman}',
'\\pagestyle{empty}',
'\\halftitle',
'\\cleardoublepage']
INTRONUMS = ['\\mainmatter', '\\pagenumbering{arabic}', '\\setcounter{page}{1}']
CHAPINTRO = '\\chapter{Introduction}'
chapintro_i = -1
for i, l in enumerate(lines):
if l.startswith(BEGINDOC):
frontnums_i = i + 1
elif l.startswith(CHAPINTRO):
chapintro_i = i
break
for i, v in enumerate(FRONTNUMS):
lines.insert(frontnums_i + i, v)
for i, v in enumerate(INTRONUMS):
if chapintro_i > 0:
lines.insert(chapintro_i + len(FRONTNUMS) + i, v)

def _replace_chars_in_chapter_title_and_caption(lines):
CAP_CHAP = {'\\chapter{', '\\section{', '\\caption{'}

def _get_replaced(s):
BEFORES = ['’', '“', '”', '–']
AFTERS = ['\'', '``', '\'\'', '--']
for before, after in zip(BEFORES, AFTERS):
s = s.replace(before, after)
return s

i = 0
while i < len(lines):
if any(lines[i].startswith(cap_chap) for cap_chap in CAP_CHAP):
num_lefts = 0
found_end = False
while not found_end:
j_start = 0
j_end = len(lines[i])
for j, char in enumerate(lines[i]):
if char == '{':
num_lefts += 1
if num_lefts == 1:
j_start = j + 1
elif char == '}':
num_lefts -= 1
if num_lefts == 0:
j_end = j
found_end = True
break
lines[i] = lines[i][:j_start] + _get_replaced(lines[i][j_start:j_end]) + lines[i][j_end:]
if not found_end:
i += 1
i += 1

for i, l in enumerate(lines):
if l.startswith('\\chapter{') or l.startswith('\\section{'):
lines[i] = lines[i].replace('--', '\(-\)')

def _edit_titlepage(pdf_dir):
smanual = os.path.join(pdf_dir, 'sphinxmanual.cls')
with open(smanual, 'r') as f:
lines = f.read().split('\n')

for i, l in enumerate(lines):
lines[i] = l.replace('\\@date', '')

with open(smanual, 'w') as f:
f.write('\n'.join(lines))

def delete_lines(lines, deletes):
return [line for i, line in enumerate(lines) if i not in deletes]

def _delete_discussions_title(lines):
deletes = []
to_delete = False
for i, l in enumerate(lines):
if 'section*{Discussion' in l or 'section{Discussion' in l:
to_delete = True
elif to_delete and '\\sphinxincludegraphics' in l:
to_delete = False
if to_delete:
deletes.append(i)
return delete_lines(lines, deletes)

def _protect_hyperlink_in_caption(lines):
def _get_num_extra_left_braces(l, num_extra_left_braces):
num = num_extra_left_braces
for char in l:
if char == '{':
num += 1
elif char == '}':
num -= 1
if num == 0:
return 0
return num

i = 0
while i < len(lines):
if lines[i].startswith('\\caption{') or lines[i].startswith('\\sphinxcaption{'):
num_extra_left_braces = _get_num_extra_left_braces(lines[i], 0)
if num_extra_left_braces == 0:
j = i
else:
j = i + 1
while j < len(lines):
num_extra_left_braces = _get_num_extra_left_braces(
lines[j], num_extra_left_braces)
if num_extra_left_braces == 0:
break
j += 1
for index in range(i, j + 1):
lines[index] = lines[index].replace('\\hyperlink', '\\protect\\hyperlink')
i = j + 1
else:
i += 1

def _remove_appendix_numbering_and_rename_bib(lines):
BEGIN_APPENDIX = '\\chapter{Appendix'
BEGIN_BIB = '\\begin{sphinxthebibliography'
END_APPENDIX = ['\\endappendix',
'\\renewcommand\\bibname{References}'
]

found_begin_appendix = False
one_appendix = True
for i, l in enumerate(lines):
if l.startswith(BEGIN_APPENDIX):
lines[i] = lines[i].replace('\\chapter{Appendix: ', '\\chapter{')
if found_begin_appendix:
one_appendix = False
else:
appendix_i = i
found_begin_appendix = True
elif l.startswith(BEGIN_BIB):
bib_i = i

for i, v in enumerate(END_APPENDIX):
lines.insert(bib_i + i, v)
if one_appendix:
lines.insert(appendix_i, '\\oneappendix')
else:
lines.insert(appendix_i, '\\appendix')

def _fit_chapter_titles(lines):
for i, l in enumerate(lines):
if l.startswith('\\chapter{Mathematics for Deep Learning}'):
lines[i] = '\\chapter[Mathematics for Deep Learning]{Mathematics for Deep\\\\Learning}'
if l.startswith('\\chapter{Linear Neural Networks for Classification}'):
lines[i] = '\\chapter[Linear Neural Networks for Classification]{\\raisebox{-12pt}{Linear Neural Networks for Classification}}'

def _remove_footnote_trailing_space(lines):
seen_discussion_url = False
for i, l in enumerate(lines):
if l.startswith('\sphinxnolinkurl{'):
lines[i] += '\\sphinxAtStartFootnote'
if l.startswith('\\sphinxhref{https://discuss.d2l.ai/t/'):
seen_discussion_url = True
if seen_discussion_url and l.startswith('\\end{footnote}'):
lines[i] += '.'
seen_discussion_url = False

def _add_extra_line_before_endbib(lines):
for i, l in enumerate(lines):
if l.startswith('\\end{sphinxthebibliography}'):
break
lines.insert(i, '')

def _remove_index(lines):
for i, l in enumerate(lines):
j_start = 0
while j_start < len(l)-6:
if l[j_start:j_start+7] == '\\index{':
j = j_start + 7
num_extra_left_braces = 1
while num_extra_left_braces > 0:
if l[j] == '{':
num_extra_left_braces += 1
elif l[j] == '}':
num_extra_left_braces -= 1
j += 1
enclosed_text = l[j_start+7:j-1]
lines[i] = lines[i].replace('\\index{' + enclosed_text + '}', '')
j_start = j
else:
j_start += 1

def _fix_indent_at_chap_start(lines):
is_chap_start = False
for i, l in enumerate(lines):
if l.startswith('\\chapter'):
is_chap_start = True
if is_chap_start and l.startswith('\\sphinxAtStartPar'):
lines[i] = ''
is_chap_start = False

def _ai_text_analysis(lines):
""" Use AI to analyze and enhance text content """
for i, l in enumerate(lines):
# Here we use the AI model to analyze the text and suggest corrections or improvements
result = nlp_pipeline(l, max_length=50, num_return_sequences=1)
lines[i] = result[0]['generated_text']

def main():
tex_file = sys.argv[1]
with open(tex_file, 'r') as f:
lines = f.read().split('\n')

_unnumber_chaps_and_secs(lines)
_sec_to_chap(lines)
#lines = _delete_discussions_title(lines)
_protect_hyperlink_in_caption(lines)
_pagenumbering(lines)
_replace_chars_in_chapter_title_and_caption(lines)
_remove_appendix_numbering_and_rename_bib(lines)
_fit_chapter_titles(lines)
_remove_footnote_trailing_space(lines)
_add_extra_line_before_endbib(lines)
_remove_index(lines)
_fix_indent_at_chap_start(lines)
_ai_text_analysis(lines) # Integrate AI text analysis

with open(tex_file, 'w') as f:
f.write('\n'.join(lines))

pdf_dir = os.path.dirname(tex_file)
#_edit_titlepage(pdf_dir)

if __name__ == "__main__":
main()

0 comments on commit 484979f

Please sign in to comment.