From 22f20e64905c29f02734b92df2dd656d41c1b434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Mon, 12 Nov 2018 20:32:12 -0500 Subject: [PATCH 1/4] support linking files in place instead of copying This allows users to manage collections of large WARC files without duplicating space. Hardlinks are used instead of symlinks to reflect the original mechanism, where the file is copied (so it can be safely removed from the source). If we used symlinks, we would break that expectation, which could lead to data loss. Conversely, hardlinks can lead to data loss as well. For example, pywb could somehow edit the file, which would modify the original as well. But we assume here pywb does not modify the file. (Both names of a hardlink refer to the same inode, so ownership and permissions are shared by the two paths and cannot be restricted on one side only.) Closes: #408 --- pywb/manager/manager.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 04fe90f2a..98fcd6c15 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -108,7 +108,7 @@ def _assert_coll_exists(self): 'To create a new collection, run\n\n{1} init {0}') raise IOError(msg.format(self.coll_name, sys.argv[0])) - def add_warcs(self, warcs): + def add_warcs(self, warcs, hardlink=False): if not os.path.isdir(self.archive_dir): raise IOError('Directory {0} does not exist'. 
format(self.archive_dir)) @@ -116,9 +116,16 @@ def add_warcs(self, warcs): full_paths = [] for filename in warcs: filename = os.path.abspath(filename) - shutil.copy2(filename, self.archive_dir) + if hardlink: + os.link(filename, os.path.join(self.archive_dir, + os.path.basename(filename))) + else: + shutil.copy2(filename, self.archive_dir) full_paths.append(os.path.join(self.archive_dir, filename)) - logging.info('Copied ' + filename + ' to ' + self.archive_dir) + logging.info('%s %s to %s', + hardlink and 'Linked' or 'Copied', + filename, + self.archive_dir) self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) @@ -357,12 +364,14 @@ def do_list(r): # Add Warcs def do_add(r): m = CollectionsManager(r.coll_name) - m.add_warcs(r.files) + m.add_warcs(r.files, r.hardlink) addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex' addwarc = subparsers.add_parser('add', help=addwarc_help) addwarc.add_argument('coll_name') addwarc.add_argument('files', nargs='+') + addwarc.add_argument('--hardlink', '-l', action='store_true', + help='hardlink files into storage instead of copying') addwarc.set_defaults(func=do_add) # Reindex All From 30a944256824eefed9428fd4193b492345bc4f36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Mon, 12 Nov 2018 21:33:55 -0500 Subject: [PATCH 2/4] also support symlinks --- pywb/manager/manager.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 98fcd6c15..9def696b6 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -108,7 +108,7 @@ def _assert_coll_exists(self): 'To create a new collection, run\n\n{1} init {0}') raise IOError(msg.format(self.coll_name, sys.argv[0])) - def add_warcs(self, warcs, hardlink=False): + def add_warcs(self, warcs, method='copy'): if not os.path.isdir(self.archive_dir): raise IOError('Directory {0} does not exist'. 
format(self.archive_dir)) @@ -116,16 +116,19 @@ def add_warcs(self, warcs, hardlink=False): full_paths = [] for filename in warcs: filename = os.path.abspath(filename) - if hardlink: + logging.info('%s %s to %s', + method, + filename, + self.archive_dir) + if method == 'hardlink': os.link(filename, os.path.join(self.archive_dir, os.path.basename(filename))) + elif method == 'symlink': + os.symlink(filename, os.path.join(self.archive_dir, + os.path.basename(filename))) else: shutil.copy2(filename, self.archive_dir) full_paths.append(os.path.join(self.archive_dir, filename)) - logging.info('%s %s to %s', - hardlink and 'Linked' or 'Copied', - filename, - self.archive_dir) self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) @@ -364,13 +367,20 @@ def do_list(r): # Add Warcs def do_add(r): m = CollectionsManager(r.coll_name) - m.add_warcs(r.files, r.hardlink) + m.add_warcs(r.files, r.method) addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex' addwarc = subparsers.add_parser('add', help=addwarc_help) addwarc.add_argument('coll_name') addwarc.add_argument('files', nargs='+') - addwarc.add_argument('--hardlink', '-l', action='store_true', + addwarc.add_argument('--method', '-m', default='copy', + help='import method (default: %(default)s)', + choices=('copy', 'symlink', 'hardlink')) + addwarc.add_argument('--symlink', '-s', action='store_const', + dest='method', const='symlink', + help='symlink files into storage instead of copying') + addwarc.add_argument('--hardlink', '-l', action='store_const', + dest='method', const='hardlink', help='hardlink files into storage instead of copying') addwarc.set_defaults(func=do_add) From 5df94c01b3fa12f7f03bd4c2ddbd6ee53476ee2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Mon, 12 Nov 2018 21:17:21 -0500 Subject: [PATCH 3/4] sanity check: ensure a valid parameter is passed to add_warcs --- pywb/manager/manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/pywb/manager/manager.py b/pywb/manager/manager.py index 9def696b6..6d1e7328c 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -126,8 +126,10 @@ def add_warcs(self, warcs, method='copy'): elif method == 'symlink': os.symlink(filename, os.path.join(self.archive_dir, os.path.basename(filename))) - else: + elif method == 'copy': shutil.copy2(filename, self.archive_dir) + else: + raise NotImplementedError('unknown method name: %s' % method) full_paths.append(os.path.join(self.archive_dir, filename)) self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) From 011640af4993a0c923e7c8f3196de52e79797999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20Beaupr=C3=A9?= Date: Mon, 12 Nov 2018 21:17:33 -0500 Subject: [PATCH 4/4] cosmetic: capitalize method in output, following existing convention --- pywb/manager/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 6d1e7328c..5f8d49836 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -117,7 +117,7 @@ def add_warcs(self, warcs, method='copy'): for filename in warcs: filename = os.path.abspath(filename) logging.info('%s %s to %s', - method, + method.title(), filename, self.archive_dir) if method == 'hardlink':