Skip to content

Commit

Permalink
fix encoding for broken microsoft word html format
Browse files Browse the repository at this point in the history
  • Loading branch information
noisecode3 committed Nov 3, 2024
1 parent 841ffcc commit c1a1fc4
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 29 deletions.
14 changes: 8 additions & 6 deletions database/https.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,14 @@ def get_response(self, url, content_type):
def pack_response_buffer(self, content_type, response_buffer):
"""Validate and return the response based on content type"""
if content_type == 'text/html':
return response_buffer.getvalue().decode('utf-8', errors='ignore')
raw_data = response_buffer.getvalue()
for encoding in ['utf-8', 'windows-1252', 'utf-16', 'utf-32']:
try:
return raw_data.decode(encoding)
except UnicodeDecodeError:
continue
logging.error("No known encoding")
sys.exit(1)
if content_type == 'application/json':
return json.loads(response_buffer.getvalue().decode('utf-8'))
if content_type in ['image/jpeg', 'image/png']:
Expand Down Expand Up @@ -357,7 +364,6 @@ def download_file(self, url):
return {} # Return an empty dict on error

self.status = 0
print("Downloaded successfully.")

# Finalize MD5 checksum
md5_hash = hashlib.md5(usedforsecurity=False)
Expand Down Expand Up @@ -413,7 +419,3 @@ def release_lock():
def is_locked():
    """Return whatever ``ACQUIRE_LOCK.is_locked()`` reports.

    The result of the underlying call is passed straight back to the caller
    instead of being dropped (which made this function always return None).
    """
    return ACQUIRE_LOCK.is_locked()


#if __name__ == '__main__':
# print(get("https://www.trle.net/scadm/trle_dl.php?lid=3667", 'application/zip'))
19 changes: 19 additions & 0 deletions database/ideas.txt
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,22 @@ class Downloader:
Never forget how we can test one function in Python:
python3 -c "from index_scrape import get_trle_page; print(get_trle_page(0, True))"


Tracking TRLE Records Efficiently

For TRCustoms, tracking and indexing records is streamlined and fast through JSON.

However, tracking records on TRLE is more complex. One approach could involve scanning
the latest pages and identifying any gaps in ID numbers. These gaps may indicate
deleted records, though it’s likely not necessary to handle them differently. However,
if a newly listed record’s ID deviates significantly from the last known increment—for instance,
going from IDs 666, 667, and 668 to a much lower number like 345—then the record with that
out-of-sequence ID should be checked for updates if it already exists in our database.

A full resync of TRLE data will be infrequent, as it would require around 3,700 requests,
potentially taking 1-2 days—something not feasible for regular users. To keep our
index database relevant and manageable, a yearly refresh should suffice. This approach
will focus on creating an index database of around 500 MB, rather than replicating
the entire TRLE database, which could exceed 20 GB. Additional data, such as levels
of specific interest to users, can be cached or downloaded manually, within a reasonable
limit of around 2 GB.
23 changes: 0 additions & 23 deletions database/index_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,26 +353,3 @@ def get_key(id_number):

print("Serial Number:", serial_number)

#return validate_downloaded_key(id_number, serial_number)


'''
# Create a temporary file to hold the combined certificate
with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp_cert_file:
# Write all certificates into the temporary file
for cert in cert_list:
temp_cert_file.write(cert)
# Store the name of the temporary file
temp_cert_filename = temp_cert_file.name
# Now use this temporary file with requests for SSL verification
response = requests.get(url, verify=temp_cert_filename, timeout=5)
# Once done, you can clean up the temporary file
import os
os.remove(temp_cert_filename)
'''

#ab:a9:b5:e7:a4:8c:f3:fc:5a:73:da:16:04:36:03:20
#https://crt.sh/?serial=ab%3Aa9%3Ab5%3Ae7%3Aa4%3A8c%3Af3%3Afc%3A5a%3A73%3Ada%3A16%3A04%3A36%3A03%3A20
21 changes: 21 additions & 0 deletions database/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import re
# Replace custom tags with HTML spans and apply classes
# https://raw.githubusercontent.com/rr-/TRCustoms/develop/frontend/src/components/markdown-composer/MarkdownButtons/index.tsx
def custom_markdown_parser(text):
    """Rewrite the custom bracket tags found in ``text`` as HTML markup.

    Every ``[tag]...[/tag]`` pair listed below is replaced by the matching
    HTML element, one rule after another in the order given. The patterns
    are non-greedy and use the default regex flags, so ``.`` does not cross
    a newline: a tag pair is only rewritten when it opens and closes on the
    same line. Text without any recognised pair is returned unchanged.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\[o\](.*?)\[/o\]', r'<span class="object">\1</span>'),  # objects
        (r'\[s\](.*?)\[/s\]', r'<span class="secret">\1</span>'),  # secrets
        (r'\[p\](.*?)\[/p\]', r'<span class="pickup">\1</span>'),  # pickups
        (r'\[e\](.*?)\[/e\]', r'<span class="enemy">\1</span>'),  # enemies
        (r'\[t\](.*?)\[/t\]', r'<span class="trap">\1</span>'),  # traps
        (
            r'\[center\](.*?)\[/center\]',
            r'<div style="text-align: center;">\1</div>',
        ),  # centered block
    )

    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

# Example usage
#description = """[center]**Tomb Raider: Pandora's Box**[/center]
#[s]Secret text[/s] [o]Object text[/o] [p]Pickup text[/p]"""

#parsed_description = custom_markdown_parser(description)
#print(parsed_description)


0 comments on commit c1a1fc4

Please sign in to comment.