From af92a9726e30781b146abf6669710d47f1ad6975 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Jeleni=C4=87?= Date: Thu, 23 Nov 2023 16:33:38 +0100 Subject: [PATCH 1/6] Sets "Pywb Error" string as translatable in error template (#868) --- pywb/templates/error.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/templates/error.html b/pywb/templates/error.html index 2cf9a276a..ba15dbd5f 100644 --- a/pywb/templates/error.html +++ b/pywb/templates/error.html @@ -3,7 +3,7 @@ {% block body %}
-

Pywb Error

+

{{ _('Pywb Error') }}

From 79140441dfb722897e8b9a101552c41c724cc0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Jeleni=C4=87?= Date: Thu, 23 Nov 2023 16:50:56 +0100 Subject: [PATCH 2/6] Fixes switch_locale not adding locale if missing from URL (#871) If the two letter language code was missing in the URI, switch_locale(locale) didn't add it (it worked fine if it was present). That means that it produced the same URL for all locales, each missing the two letter language code in the URL. --- pywb/rewrite/templateview.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/rewrite/templateview.py b/pywb/rewrite/templateview.py index 7f0cbc88e..518df18b2 100644 --- a/pywb/rewrite/templateview.py +++ b/pywb/rewrite/templateview.py @@ -178,7 +178,7 @@ def switch_locale(context, locale): request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO')) - if curr_loc: + if curr_loc and request_uri.startswith('/' + curr_loc + '/'): return request_uri.replace(curr_loc, locale, 1) app_prefix = environ.get('pywb.app_prefix', '') From 013746c10a44ef1bf4daeffce5bb0a27ff4a0a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Jeleni=C4=87?= Date: Thu, 23 Nov 2023 16:56:26 +0100 Subject: [PATCH 3/6] Fixes environ paths when default_locale set (#873) If the default_locale was set and the URL path didn't contain a language code, it was behaving as if there was a language code in the URL. In that case, it was moving part of the PATH_INFO to SCRIPT_NAME, but as there wasn't any language code in the URL, it moved something else. This fixes that. 
--- pywb/apps/frontendapp.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pywb/apps/frontendapp.py b/pywb/apps/frontendapp.py index a10c7a423..9ebb036fe 100644 --- a/pywb/apps/frontendapp.py +++ b/pywb/apps/frontendapp.py @@ -667,10 +667,14 @@ def handle_request(self, environ, start_response): # store original script_name (original prefix) before modifications are made environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME') - lang = args.pop('lang', self.default_locale) + lang = args.pop('lang', '') if lang: pop_path_info(environ) + + if lang: environ['pywb_lang'] = lang + elif self.default_locale: + environ['pywb_lang'] = self.default_locale response = endpoint(environ, **args) From 7879dd022263200b8b628be8b8539e8114d6082c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ivan=20Jeleni=C4=87?= Date: Thu, 23 Nov 2023 16:59:06 +0100 Subject: [PATCH 4/6] Fixes get_locale_prefixes() wrong paths (#874) If default_locale was set, and a web page was visited that doesn't have a language code in the path in the URL, the URL path parts returned by get_locale_prefixes() were wrong (e.g. /hrst/ instead of /hr/test/). 
--- pywb/rewrite/templateview.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pywb/rewrite/templateview.py b/pywb/rewrite/templateview.py index 518df18b2..208c2f4ca 100644 --- a/pywb/rewrite/templateview.py +++ b/pywb/rewrite/templateview.py @@ -196,11 +196,11 @@ def get_locale_prefixes(context): orig_prefix = environ.get('pywb.app_prefix', '') coll = environ.get('SCRIPT_NAME', '') - if orig_prefix: + if orig_prefix and coll.startswith(orig_prefix): coll = coll[len(orig_prefix):] curr_loc = environ.get('pywb_lang', '') - if curr_loc: + if curr_loc and coll.startswith('/' + curr_loc): coll = coll[len(curr_loc) + 1:] for locale in loc_map.keys(): From 6b4f9b323eaede2c526f9af336b0feba0fec8413 Mon Sep 17 00:00:00 2001 From: Florian Zimmermeister Date: Thu, 23 Nov 2023 17:02:10 +0100 Subject: [PATCH 5/6] Fix code sample syntax in README (#864) --- README.rst | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 87b66902c..9fe11e53a 100644 --- a/README.rst +++ b/README.rst @@ -60,9 +60,7 @@ Installation for Deployment To install pywb for usage, you can use: -```shell -pip install pywb -``` +``pip install pywb`` Note: depending on your Python installation, you may have to use `pip3` instead of `pip`. @@ -70,9 +68,7 @@ Note: depending on your Python installation, you may have to use `pip3` instead Installation from local copy ---------------------------- -```shell -git clone https://github.com/webrecorder/pywb -``` +``git clone https://github.com/webrecorder/pywb`` To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``. 
From f40e7ef18c46c6e06e6c45f39ccd75d7ca7c80d5 Mon Sep 17 00:00:00 2001 From: kuechensofa <89413714+kuechensofa@users.noreply.github.com> Date: Thu, 23 Nov 2023 18:10:52 +0100 Subject: [PATCH 6/6] Sort index when adding wacz archives (#820) --- pywb/manager/manager.py | 62 ++++++++++++++++++++++------------------- tests/test_manager.py | 5 ++++ 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 40a8bef8f..d983c726a 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -12,7 +12,7 @@ from pkg_resources import resource_string, get_distribution from argparse import ArgumentParser, RawTextHelpFormatter -from tempfile import mkdtemp +from tempfile import mkdtemp, TemporaryDirectory from zipfile import ZipFile from pywb.utils.loaders import load_yaml_config @@ -213,35 +213,35 @@ def _add_wacz_uncompressed(self, wacz): # delete temporary files shutil.rmtree(temp_dir) - @staticmethod - def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping): + def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping): from pywb.warcserver.index.cdxobject import CDXObject - # copy collection index to temporary directory - tempdir = mkdtemp() - collection_index_name = os.path.basename(collection_index_path) - collection_index_temp_path = os.path.join(tempdir, collection_index_name) - - if os.path.exists(collection_index_path): - shutil.copy2(collection_index_path, collection_index_temp_path) + # rewrite wacz index to temporary index file + tempdir = TemporaryDirectory() + wacz_index_name = os.path.basename(wacz_index_path) + rewritten_index_path = os.path.join(tempdir.name, wacz_index_name) - with open(collection_index_temp_path, 'a') as collection_index_temp_file: + with open(rewritten_index_path, 'w') as rewritten_index: if wacz_index_path.endswith('.gz'): - wacz_index_file = gzip.open(wacz_index_path, 'rb') + wacz_index = gzip.open(wacz_index_path, 'rb') 
else: - wacz_index_file = open(wacz_index_path, 'rb') - collection_index_temp_file.write('\n') - for line in wacz_index_file.readlines(): + wacz_index = open(wacz_index_path, 'rb') + + for line in wacz_index: cdx_object = CDXObject(cdxline=line) if cdx_object['filename'] in filename_mapping: cdx_object['filename'] = filename_mapping[cdx_object['filename']] - collection_index_temp_file.write(cdx_object.to_cdxj()) + rewritten_index.write(cdx_object.to_cdxj()) + + if not os.path.isfile(collection_index_path): + shutil.move(rewritten_index_path, collection_index_path) + return - wacz_index_file.close() + temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now() + self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path) + shutil.move(temp_coll_index_path, collection_index_path) - # copy temporary index back to original location and delete temporary directory - shutil.move(collection_index_temp_path, collection_index_path) - shutil.rmtree(tempdir) + tempdir.cleanup() def reindex(self): cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) @@ -294,20 +294,24 @@ def _index_merge_warcs(self, new_warcs, index_file, rel_root=None): merged_file = temp_file + '.merged' - last_line = None - - with open(cdx_file, 'rb') as orig_index: - with open(temp_file, 'rb') as new_index: - with open(merged_file, 'w+b') as merged: - for line in heapq.merge(orig_index, new_index): - if last_line != line: - merged.write(line) - last_line = line + self._merge_indices(cdx_file, temp_file, merged_file) shutil.move(merged_file, cdx_file) #os.rename(merged_file, cdx_file) os.remove(temp_file) + @staticmethod + def _merge_indices(index1, index2, dest): + last_line = None + + with open(index1, 'rb') as index1_f: + with open(index2, 'rb') as index2_f: + with open(dest, 'wb') as dest_f: + for line in heapq.merge(index1_f, index2_f): + if last_line != line: + dest_f.write(line) + last_line = line + def set_metadata(self, namevalue_pairs): 
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml') metadata = None diff --git a/tests/test_manager.py b/tests/test_manager.py index 285e64f8e..c960674bb 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -75,10 +75,15 @@ def test_merge_wacz_index(self, tmp_path): {'example.warc.gz': 'rewritten.warc.gz'}) with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: index_content = f.read() + index_content = index_content.strip() assert 'example.warc.gz' not in index_content assert 'rewritten.warc.gz' in index_content + # check that collection index is sorted + index_lines = index_content.split('\n') + assert sorted(index_lines) == index_lines + def test_merge_wacz_index_gzip(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),