Merge pull request #1 from RahulVadisetty91/RahulVadisetty91-patch-1
Enhanced Text Processing with AI Integration
Showing 1 changed file with 278 additions and 0 deletions.
import os
import re
import regex
import sys
from transformers import pipeline  # Hugging Face pipeline factory for the AI model

# Initialize the NLP pipeline for text generation or analysis.
# The Hugging Face Hub id for GPT-2 is 'gpt2'.
nlp_pipeline = pipeline('text-generation', model='gpt2')
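
# A minimal sanity-check sketch of the pipeline's return shape (assumes the
# 'gpt2' weights are cached locally or can be downloaded): each call returns
# a list of dicts that hold the text under the 'generated_text' key, which is
# what _ai_text_analysis below indexes into.
#
#   sample = nlp_pipeline('Hello world', max_length=10, num_return_sequences=1)
#   print(sample[0]['generated_text'])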
def _unnumber_chaps_and_secs(lines):
    # Preface, Installation, and Notation are unnumbered chapters
    NUM_UNNUMBERED_CHAPS = 3
    # Preliminaries
    TOC2_START_CHAP_NO = 5

    num_chaps = 0
    for i, l in enumerate(lines):
        if l.startswith('\\chapter{'):
            num_chaps += 1
            # Unnumber unnumbered chapters
            if num_chaps <= NUM_UNNUMBERED_CHAPS:
                chap_name = re.split('{|}', l)[1]
                lines[i] = ('\\chapter*{' + chap_name
                            + '}\\addcontentsline{toc}{chapter}{'
                            + chap_name + '}\n')
            # Set tocdepth to 2 after Chap 1
            elif num_chaps == TOC2_START_CHAP_NO:
                lines[i] = ('\\addtocontents{toc}{\\protect\\setcounter{tocdepth}{2}}\n'
                            + lines[i])
        # Unnumber all sections in unnumbered chapters
        elif 1 <= num_chaps <= NUM_UNNUMBERED_CHAPS:
            if (l.startswith('\\section') or l.startswith('\\subsection')
                    or l.startswith('\\subsubsection')):
                lines[i] = l.replace('section{', 'section*{')

    # Since we inserted '\n' in some lines[i], re-build the list in place
    # so the caller's list is updated too.
    lines[:] = '\n'.join(lines).split('\n')
def _sec_to_chap(lines):
    for i, l in enumerate(lines):
        # Recursive pattern ((?R)) grabs whole balanced-brace groups
        longest_balanced_braces = regex.findall(r'\{(?>[^{}]|(?R))*\}', l)
        for src in longest_balanced_braces:
            if src.startswith('{Section \\ref') and 'index:' in src:
                tgt = src.replace('Section \\ref', 'Chapter \\ref')
                lines[i] = lines[i].replace(src, tgt)
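
# Note: the pattern above relies on recursion ((?R)) and an atomic group,
# features of the third-party `regex` module rather than the stdlib `re`.
# For example, on the line '{Section \\ref{sec_a}, index:1}' it matches the
# entire outer group instead of stopping at the first inner '}'.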
def _pagenumbering(lines):
    BEGINDOC = '\\begin{document}'
    FRONTNUMS = ['\\pagenumbering{roman}',
                 '\\pagestyle{empty}',
                 '\\halftitle',
                 '\\cleardoublepage']
    INTRONUMS = ['\\mainmatter', '\\pagenumbering{arabic}', '\\setcounter{page}{1}']
    CHAPINTRO = '\\chapter{Introduction}'
    frontnums_i = -1
    chapintro_i = -1
    for i, l in enumerate(lines):
        if l.startswith(BEGINDOC):
            frontnums_i = i + 1
        elif l.startswith(CHAPINTRO):
            chapintro_i = i
            break
    # Guard: only insert if \begin{document} was found
    if frontnums_i >= 0:
        for i, v in enumerate(FRONTNUMS):
            lines.insert(frontnums_i + i, v)
    for i, v in enumerate(INTRONUMS):
        if chapintro_i > 0:
            lines.insert(chapintro_i + len(FRONTNUMS) + i, v)
def _replace_chars_in_chapter_title_and_caption(lines):
    CAP_CHAP = {'\\chapter{', '\\section{', '\\caption{'}

    def _get_replaced(s):
        # Replace typographic quotes and dashes with their LaTeX equivalents
        BEFORES = ['’', '“', '”', '–']
        AFTERS = ['\'', '``', '\'\'', '--']
        for before, after in zip(BEFORES, AFTERS):
            s = s.replace(before, after)
        return s

    i = 0
    while i < len(lines):
        if any(lines[i].startswith(cap_chap) for cap_chap in CAP_CHAP):
            num_lefts = 0
            found_end = False
            # Titles and captions may span multiple lines; scan until the
            # opening brace is balanced
            while not found_end:
                j_start = 0
                j_end = len(lines[i])
                for j, char in enumerate(lines[i]):
                    if char == '{':
                        num_lefts += 1
                        if num_lefts == 1:
                            j_start = j + 1
                    elif char == '}':
                        num_lefts -= 1
                        if num_lefts == 0:
                            j_end = j
                            found_end = True
                            break
                lines[i] = (lines[i][:j_start]
                            + _get_replaced(lines[i][j_start:j_end])
                            + lines[i][j_end:])
                if not found_end:
                    i += 1
        i += 1

    for i, l in enumerate(lines):
        if l.startswith('\\chapter{') or l.startswith('\\section{'):
            lines[i] = lines[i].replace('--', '\\(-\\)')
def _edit_titlepage(pdf_dir):
    smanual = os.path.join(pdf_dir, 'sphinxmanual.cls')
    with open(smanual, 'r') as f:
        lines = f.read().split('\n')

    for i, l in enumerate(lines):
        lines[i] = l.replace('\\@date', '')

    with open(smanual, 'w') as f:
        f.write('\n'.join(lines))
def delete_lines(lines, deletes):
    return [line for i, line in enumerate(lines) if i not in deletes]
def _delete_discussions_title(lines):
    deletes = []
    to_delete = False
    for i, l in enumerate(lines):
        if 'section*{Discussion' in l or 'section{Discussion' in l:
            to_delete = True
        elif to_delete and '\\sphinxincludegraphics' in l:
            to_delete = False
        if to_delete:
            deletes.append(i)
    return delete_lines(lines, deletes)
def _protect_hyperlink_in_caption(lines):
    def _get_num_extra_left_braces(l, num_extra_left_braces):
        num = num_extra_left_braces
        for char in l:
            if char == '{':
                num += 1
            elif char == '}':
                num -= 1
                if num == 0:
                    return 0
        return num

    i = 0
    while i < len(lines):
        if lines[i].startswith('\\caption{') or lines[i].startswith('\\sphinxcaption{'):
            num_extra_left_braces = _get_num_extra_left_braces(lines[i], 0)
            if num_extra_left_braces == 0:
                j = i
            else:
                j = i + 1
                while j < len(lines):
                    num_extra_left_braces = _get_num_extra_left_braces(
                        lines[j], num_extra_left_braces)
                    if num_extra_left_braces == 0:
                        break
                    j += 1
            for index in range(i, j + 1):
                lines[index] = lines[index].replace('\\hyperlink',
                                                    '\\protect\\hyperlink')
            i = j + 1
        else:
            i += 1
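
# Background: a caption is a "moving argument" in LaTeX, so fragile commands
# such as \hyperlink inside it must be prefixed with \protect or they can
# break when the caption is written out to the list-of-figures file; the loop
# above applies that protection across multi-line captions as well.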
def _remove_appendix_numbering_and_rename_bib(lines):
    BEGIN_APPENDIX = '\\chapter{Appendix'
    BEGIN_BIB = '\\begin{sphinxthebibliography'
    END_APPENDIX = ['\\endappendix',
                    '\\renewcommand\\bibname{References}']

    found_begin_appendix = False
    one_appendix = True
    appendix_i = -1
    bib_i = -1
    for i, l in enumerate(lines):
        if l.startswith(BEGIN_APPENDIX):
            lines[i] = lines[i].replace('\\chapter{Appendix: ', '\\chapter{')
            if found_begin_appendix:
                one_appendix = False
            else:
                appendix_i = i
                found_begin_appendix = True
        elif l.startswith(BEGIN_BIB):
            bib_i = i

    # Guards: only insert when a bibliography/appendix was actually found
    if bib_i >= 0:
        for i, v in enumerate(END_APPENDIX):
            lines.insert(bib_i + i, v)
    if appendix_i >= 0:
        if one_appendix:
            lines.insert(appendix_i, '\\oneappendix')
        else:
            lines.insert(appendix_i, '\\appendix')
def _fit_chapter_titles(lines):
    for i, l in enumerate(lines):
        if l.startswith('\\chapter{Mathematics for Deep Learning}'):
            lines[i] = ('\\chapter[Mathematics for Deep Learning]'
                        '{Mathematics for Deep\\\\Learning}')
        if l.startswith('\\chapter{Linear Neural Networks for Classification}'):
            lines[i] = ('\\chapter[Linear Neural Networks for Classification]'
                        '{\\raisebox{-12pt}{Linear Neural Networks for Classification}}')
def _remove_footnote_trailing_space(lines):
    seen_discussion_url = False
    for i, l in enumerate(lines):
        if l.startswith('\\sphinxnolinkurl{'):
            lines[i] += '\\sphinxAtStartFootnote'
        if l.startswith('\\sphinxhref{https://discuss.d2l.ai/t/'):
            seen_discussion_url = True
        if seen_discussion_url and l.startswith('\\end{footnote}'):
            lines[i] += '.'
            seen_discussion_url = False
def _add_extra_line_before_endbib(lines):
    for i, l in enumerate(lines):
        if l.startswith('\\end{sphinxthebibliography}'):
            # Only insert when the bibliography end was actually found
            lines.insert(i, '')
            break
def _remove_index(lines):
    for i, l in enumerate(lines):
        j_start = 0
        while j_start < len(l) - 6:
            if l[j_start:j_start + 7] == '\\index{':
                # Scan to the matching closing brace of \index{...}
                j = j_start + 7
                num_extra_left_braces = 1
                while num_extra_left_braces > 0:
                    if l[j] == '{':
                        num_extra_left_braces += 1
                    elif l[j] == '}':
                        num_extra_left_braces -= 1
                    j += 1
                enclosed_text = l[j_start + 7:j - 1]
                lines[i] = lines[i].replace('\\index{' + enclosed_text + '}', '')
                j_start = j
            else:
                j_start += 1
def _fix_indent_at_chap_start(lines):
    is_chap_start = False
    for i, l in enumerate(lines):
        if l.startswith('\\chapter'):
            is_chap_start = True
        if is_chap_start and l.startswith('\\sphinxAtStartPar'):
            lines[i] = ''
            is_chap_start = False
def _ai_text_analysis(lines):
    """Use AI to analyze and enhance text content."""
    for i, l in enumerate(lines):
        # Skip blank lines; there is nothing for the model to improve
        if not l.strip():
            continue
        # The model's output replaces the original line wholesale, so this
        # pass rewrites the document rather than annotating it
        result = nlp_pipeline(l, max_length=50, num_return_sequences=1)
        lines[i] = result[0]['generated_text']
def main():
    tex_file = sys.argv[1]
    with open(tex_file, 'r') as f:
        lines = f.read().split('\n')

    _unnumber_chaps_and_secs(lines)
    _sec_to_chap(lines)
    # lines = _delete_discussions_title(lines)
    _protect_hyperlink_in_caption(lines)
    _pagenumbering(lines)
    _replace_chars_in_chapter_title_and_caption(lines)
    _remove_appendix_numbering_and_rename_bib(lines)
    _fit_chapter_titles(lines)
    _remove_footnote_trailing_space(lines)
    _add_extra_line_before_endbib(lines)
    _remove_index(lines)
    _fix_indent_at_chap_start(lines)
    _ai_text_analysis(lines)  # Integrate AI text analysis

    with open(tex_file, 'w') as f:
        f.write('\n'.join(lines))

    pdf_dir = os.path.dirname(tex_file)
    # _edit_titlepage(pdf_dir)


if __name__ == "__main__":
    main()
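
# Usage sketch (the script and .tex names below are illustrative, not taken
# from this repo):
#
#   python post_latex.py build/latex/book.tex
#
# The script rewrites the file in place, so keep a backup when experimenting
# with the AI pass: _ai_text_analysis replaces each line with model output.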