Skip to content

Commit

Permalink
Merge branch 'main' into upgrade-dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
tw4l authored Mar 27, 2024
2 parents 0bdc968 + f40e7ef commit bfbb4ab
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 40 deletions.
8 changes: 2 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -60,19 +60,15 @@ Installation for Deployment

To install pywb for usage, you can use:

```shell
pip install pywb
```
``pip install pywb``

Note: depending on your Python installation, you may have to use ``pip3`` instead of ``pip``.


Installation from local copy
----------------------------

```shell
git clone https://github.com/webrecorder/pywb
```
``git clone https://github.com/webrecorder/pywb``

To install from a locally cloned copy, install with ``pip install -e .`` or ``python setup.py install``.

Expand Down
4 changes: 3 additions & 1 deletion pywb/apps/frontendapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,10 +667,12 @@ def handle_request(self, environ, start_response):
# store original script_name (original prefix) before modifications are made
environ['ORIG_SCRIPT_NAME'] = environ.get('SCRIPT_NAME')

lang = args.pop('lang', self.default_locale)
lang = args.pop('lang', '')
if lang:
shift_path_info(environ)
environ['pywb_lang'] = lang
elif self.default_locale:
environ['pywb_lang'] = self.default_locale

response = endpoint(environ, **args)

Expand Down
62 changes: 33 additions & 29 deletions pywb/manager/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pkg_resources import resource_string, get_distribution

from argparse import ArgumentParser, RawTextHelpFormatter
from tempfile import mkdtemp
from tempfile import mkdtemp, TemporaryDirectory
from zipfile import ZipFile

from pywb.utils.loaders import load_yaml_config
Expand Down Expand Up @@ -213,35 +213,35 @@ def _add_wacz_uncompressed(self, wacz):
# delete temporary files
shutil.rmtree(temp_dir)

@staticmethod
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
def _add_wacz_index(self, collection_index_path, wacz_index_path, filename_mapping):
from pywb.warcserver.index.cdxobject import CDXObject

# copy collection index to temporary directory
tempdir = mkdtemp()
collection_index_name = os.path.basename(collection_index_path)
collection_index_temp_path = os.path.join(tempdir, collection_index_name)

if os.path.exists(collection_index_path):
shutil.copy2(collection_index_path, collection_index_temp_path)
# rewrite wacz index to temporary index file
tempdir = TemporaryDirectory()
wacz_index_name = os.path.basename(wacz_index_path)
rewritten_index_path = os.path.join(tempdir.name, wacz_index_name)

with open(collection_index_temp_path, 'a') as collection_index_temp_file:
with open(rewritten_index_path, 'w') as rewritten_index:
if wacz_index_path.endswith('.gz'):
wacz_index_file = gzip.open(wacz_index_path, 'rb')
wacz_index = gzip.open(wacz_index_path, 'rb')
else:
wacz_index_file = open(wacz_index_path, 'rb')
collection_index_temp_file.write('\n')
for line in wacz_index_file.readlines():
wacz_index = open(wacz_index_path, 'rb')

for line in wacz_index:
cdx_object = CDXObject(cdxline=line)
if cdx_object['filename'] in filename_mapping:
cdx_object['filename'] = filename_mapping[cdx_object['filename']]
collection_index_temp_file.write(cdx_object.to_cdxj())
rewritten_index.write(cdx_object.to_cdxj())

if not os.path.isfile(collection_index_path):
shutil.move(rewritten_index_path, collection_index_path)
return

wacz_index_file.close()
temp_coll_index_path = collection_index_path + '.tmp.' + timestamp20_now()
self._merge_indices(collection_index_path, rewritten_index_path, temp_coll_index_path)
shutil.move(temp_coll_index_path, collection_index_path)

# copy temporary index back to original location and delete temporary directory
shutil.move(collection_index_temp_path, collection_index_path)
shutil.rmtree(tempdir)
tempdir.cleanup()

def reindex(self):
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
Expand Down Expand Up @@ -294,20 +294,24 @@ def _index_merge_warcs(self, new_warcs, index_file, rel_root=None):

merged_file = temp_file + '.merged'

last_line = None

with open(cdx_file, 'rb') as orig_index:
with open(temp_file, 'rb') as new_index:
with open(merged_file, 'w+b') as merged:
for line in heapq.merge(orig_index, new_index):
if last_line != line:
merged.write(line)
last_line = line
self._merge_indices(cdx_file, temp_file, merged_file)

shutil.move(merged_file, cdx_file)
#os.rename(merged_file, cdx_file)
os.remove(temp_file)

@staticmethod
def _merge_indices(index1, index2, dest):
    """Merge two sorted index files into ``dest``, dropping repeated lines.

    Both inputs are read as bytes and combined with ``heapq.merge``, so each
    must already be sorted for the output to be sorted. A line identical to
    the one written immediately before it is skipped.

    :param index1: path of the first index file
    :param index2: path of the second index file
    :param dest: path of the merged file to create (overwritten if present)
    """

    def _terminated(index_f):
        # A final line without a trailing newline would otherwise be glued
        # to whichever line is merged after it, and would not compare equal
        # to the same entry coming from the other file.
        for line in index_f:
            yield line if line.endswith(b'\n') else line + b'\n'

    last_line = None

    with open(index1, 'rb') as index1_f, \
            open(index2, 'rb') as index2_f, \
            open(dest, 'wb') as dest_f:
        merged = heapq.merge(_terminated(index1_f), _terminated(index2_f))
        for line in merged:
            # inputs are sorted, so duplicates are always adjacent
            if line != last_line:
                dest_f.write(line)
                last_line = line

def set_metadata(self, namevalue_pairs):
metadata_yaml = os.path.join(self.curr_coll_dir, 'metadata.yaml')
metadata = None
Expand Down
6 changes: 3 additions & 3 deletions pywb/rewrite/templateview.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def switch_locale(context, locale):

request_uri = environ.get('REQUEST_URI', environ.get('PATH_INFO'))

if curr_loc:
if curr_loc and request_uri.startswith('/' + curr_loc + '/'):
return request_uri.replace(curr_loc, locale, 1)

app_prefix = environ.get('pywb.app_prefix', '')
Expand All @@ -196,11 +196,11 @@ def get_locale_prefixes(context):
orig_prefix = environ.get('pywb.app_prefix', '')
coll = environ.get('SCRIPT_NAME', '')

if orig_prefix:
if orig_prefix and coll.startswith(orig_prefix):
coll = coll[len(orig_prefix):]

curr_loc = environ.get('pywb_lang', '')
if curr_loc:
if curr_loc and coll.startswith('/' + curr_loc):
coll = coll[len(curr_loc) + 1:]

for locale in loc_map.keys():
Expand Down
2 changes: 1 addition & 1 deletion pywb/templates/error.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
{% block body %}
<div class="container text-danger error">
<div class="row justify-content-center">
<h2 class="display-2">Pywb Error</h2>
<h2 class="display-2">{{ _('Pywb Error') }}</h2>
</div>
<div class="row">
<div class="col-12 text-center">
Expand Down
5 changes: 5 additions & 0 deletions tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,15 @@ def test_merge_wacz_index(self, tmp_path):
{'example.warc.gz': 'rewritten.warc.gz'})
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
index_content = f.read()
index_content = index_content.strip()

assert 'example.warc.gz' not in index_content
assert 'rewritten.warc.gz' in index_content

# check that collection index is sorted
index_lines = index_content.split('\n')
assert sorted(index_lines) == index_lines

def test_merge_wacz_index_gzip(self, tmp_path):
manager = self.get_test_collections_manager(tmp_path)
manager._add_wacz_index(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE),
Expand Down

0 comments on commit bfbb4ab

Please sign in to comment.