fix encoding for broken microsoft word html format

noisecode3 · Nov 3, 2024 · c1a1fc4 · c1a1fc4
1 parent 841ffcc
commit c1a1fc4
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 29 deletions.
diff --git a/database/https.py b/database/https.py
@@ -216,7 +216,14 @@ def get_response(self, url, content_type):
     def pack_response_buffer(self, content_type, response_buffer):
         """Validate and return the response based on content type"""
         if content_type == 'text/html':
-            return response_buffer.getvalue().decode('utf-8', errors='ignore')
+            raw_data = response_buffer.getvalue()
+            for encoding in ['utf-8', 'windows-1252', 'utf-16', 'utf-32']:
+                try:
+                    return raw_data.decode(encoding)
+                except UnicodeDecodeError:
+                    continue
+            logging.error("No known encoding")
+            sys.exit(1)
         if content_type == 'application/json':
             return json.loads(response_buffer.getvalue().decode('utf-8'))
         if content_type in ['image/jpeg', 'image/png']:
@@ -357,7 +364,6 @@ def download_file(self, url):
                 return {}  # Return an empty dict on error
 
             self.status = 0
-            print("Downloaded successfully.")
 
             # Finalize MD5 checksum
             md5_hash = hashlib.md5(usedforsecurity=False)
@@ -413,7 +419,3 @@ def release_lock():
 def is_locked():
     """Lock this instance"""
     ACQUIRE_LOCK.is_locked()
-
-
-#if __name__ == '__main__':
-#    print(get("https://www.trle.net/scadm/trle_dl.php?lid=3667", 'application/zip'))
diff --git a/database/ideas.txt b/database/ideas.txt
@@ -233,3 +233,22 @@ class Downloader:
 Never forget how we can test one function in python:
 python3 -c "from index_scrape import get_trle_page; print(get_trle_page(0, True))"
 
+
+Tracking TRLE Records Efficiently
+
+For TRCustoms, tracking and indexing records is streamlined and fast through JSON.
+
+Tracking records on TRLE, by contrast, is more complex. One approach is to scan
+the latest pages and look for gaps in the ID numbers. Such gaps may indicate
+deleted records, though it’s likely not necessary to handle them specially. However,
+if a listed record’s ID falls far outside the expected sequence—for instance,
+going from IDs 666, 667, and 668 to a much lower number like 345—then records
+with such out-of-sequence IDs should be checked for updates if they already exist in our database.
+
+A full resync of TRLE data will be infrequent, as it would require around 3,700 requests,
+potentially taking 1-2 days—something not feasible for regular users. To keep our
+index database relevant and manageable, a yearly refresh should suffice. This approach
+will focus on creating an index database of around 500 MB, rather than replicating
+the entire TRLE database, which could exceed 20 GB. Additional data, such as levels
+of specific interest to users, can be cached or downloaded manually, within a reasonable
+limit of around 2 GB.
diff --git a/database/index_scrape.py b/database/index_scrape.py
@@ -353,26 +353,3 @@ def get_key(id_number):
 
     print("Serial Number:", serial_number)
 
-    #return validate_downloaded_key(id_number, serial_number)
-
-
-'''
-# Create a temporary file to hold the combined certificate
-with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_cert_file:
-    # Write all certificates into the temporary file
-    for cert in cert_list:
-        temp_cert_file.write(cert)
-    
-    # Store the name of the temporary file
-    temp_cert_filename = temp_cert_file.name
-
-# Now use this temporary file with requests for SSL verification
-response = requests.get(url, verify=temp_cert_filename, timeout=5)
-
-# Once done, you can clean up the temporary file
-import os
-os.remove(temp_cert_filename)
-'''
-
-#ab:a9:b5:e7:a4:8c:f3:fc:5a:73:da:16:04:36:03:20
-#https://crt.sh/?serial=ab%3Aa9%3Ab5%3Ae7%3Aa4%3A8c%3Af3%3Afc%3A5a%3A73%3Ada%3A16%3A04%3A36%3A03%3A20
diff --git a/database/parser.py b/database/parser.py
@@ -0,0 +1,21 @@
+import re
+# Replace custom tags with HTML: [o]/[s]/[p]/[e]/[t] become <span>s with a CSS class, [center] becomes a centered <div>
+# https://raw.githubusercontent.com/rr-/TRCustoms/develop/frontend/src/components/markdown-composer/MarkdownButtons/index.tsx
+def custom_markdown_parser(text):
+    text = re.sub(r'\[o\](.*?)\[/o\]', r'<span class="object">\1</span>', text)  # blue text for objects
+    text = re.sub(r'\[s\](.*?)\[/s\]', r'<span class="secret">\1</span>', text)   # secret styling
+    text = re.sub(r'\[p\](.*?)\[/p\]', r'<span class="pickup">\1</span>', text)   # pickup styling
+    text = re.sub(r'\[e\](.*?)\[/e\]', r'<span class="enemy">\1</span>', text)    # enemy styling
+    text = re.sub(r'\[t\](.*?)\[/t\]', r'<span class="trap">\1</span>', text)     # trap styling
+    text = re.sub(r'\[center\](.*?)\[/center\]', r'<div style="text-align: center;">\1</div>', text)  # center align
+
+    return text
+
+# Example usage
+#description = """[center]**Tomb Raider: Pandora's Box**[/center]
+#[s]Secret text[/s] [o]Object text[/o] [p]Pickup text[/p]"""
+
+#parsed_description = custom_markdown_parser(description)
+#print(parsed_description)
+
+