- DO NOT USE cp -a to copy files with xattrs! INSTEAD use rsync -X -u -v. \cp does not remove absent fields from the xattrs of a file previously occupying that name! OH NO (is this a cp bug!?)
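For example, a minimal sketch of an xattr-preserving copy with rsync; src/ and dst/ are placeholders, and -a is added here on top of the flags above to carry permissions and timestamps as well:
# sketch: copy the contents of src/ into dst/, preserving xattrs (-X)
# -u skips files that are already newer in dst, -v is verbose
rsync -a -X -u -v src/ dst/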
source ~/files/venvs/sparcur-dev/bin/activate
python -m sparcur.simple.combine
echo Export complete. Check results at:
echo https://cassava.ucsd.edu/sparc/preview/archive/summary/$(readlink ~/.local/share/sparcur/export/summary/618*/LATEST)
One or more values for dataset_id can be provided as uuid, N:dataset:uuid, or dataset:uuid.
docker exec --user 836 -it $(docker ps -lqf ancestor=tgbugs/musl:sparcron-user) pypy3 -m sparcur.sparcron.rerun ${dataset_id}
See example in dockerfiles source for more.
function fetch-and-run-reports () {
local FN="/tmp/curation-export-$(date -Is).json"
curl https://cassava.ucsd.edu/sparc/preview/exports/curation-export.json -o "${FN}"
spc sheets update Organs --export-file "${FN}"
spc report all --sort-count-desc --to-sheets --export-file "${FN}"
}
fetch-and-run-reports
spc report changes \
--ttl-file https://cassava.ucsd.edu/sparc/preview/archive/exports/2021-05-25T125039,817048-0700/curation-export.ttl \
--ttl-compare https://cassava.ucsd.edu/sparc/preview/archive/exports/2021-05-24T141309,920776-0700/curation-export.ttl
spc report completeness
spc server --latest --count
# assumes asdf is the parsed curation-export.json blob
keywords = sorted(set(k for d in asdf['datasets'] if 'meta' in d and 'keywords' in d['meta']
                      for k in d['meta']['keywords']))
tar is the only one of the ‘usual’ suspects for file archiving that supports xattrs; zip cannot.
tar --force-local --xattrs -cvzf 2019-07-17T10\:44\:16\,457344.tar.gz '2019-07-17T10:44:16,457344/'
tar --force-local --xattrs -xvzf 2019-07-17T10\:44\:16\,457344.tar.gz
find 2019-07-17T10\:44\:16\,457344 -exec getfattr -d {} \;
tar --zstd -cvf asdf.tar.zst asdf
-rw-r--r-- 1 sparc sparc 95M Jan 21 19:50 2023-01-20T123933,576391-0800.tar.gz
-rw-r--r-- 1 sparc sparc 60M Jan 21 19:43 2023-01-20T123933,576391-0800.tar.xz
-rw-r--r-- 1 sparc sparc 82M Jan 21 19:49 2023-01-20T123933,576391-0800.tar.zst
So zstd is faster than gzip by a wide margin (like, way faster), but xz still wins on the ratio; for background stuff that isn't time sensitive it is better to go with the higher ratio.
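As a rule of thumb, something like the following (a sketch; archive and folder names are placeholders):
# time sensitive: zstd for speed
tar --zstd -cvf archive.tar.zst somefolder/
# background archiving: xz at high effort for the best ratio
XZ_OPT=-e9 tar -cvJf archive.tar.xz somefolder/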
Manually remove the echo after checking that you are removing what you expect.
pushd /var/www/sparc/sparc/
pushd archive/exports
find -maxdepth 1 -not -path '.' -type d -exec tar -cvJf '{}.tar.xz' '{}' \;
chown nginx:nginx *.tar.xz
# remove all but the one currently symlinked to exports
find -maxdepth 1 -not -path '.' -not -path "*$(basename $(readlink ../../exports))*" -type d -exec echo rm -r '{}' \;
popd
pushd preview/archive/summary
XZ_OPT=-e9 find -maxdepth 1 -not -path '.' -type d -newer $(ls -At *.tar.xz | head -n 1) -exec echo tar -cvJf '{}.tar.xz' '{}' \;
chown nginx:nginx *.tar.xz
# remove previous years
find -maxdepth 1 -not -path '.' -not -path "*$(date +%Y)-*" -type d -exec echo rm -r '{}' \+
# remove all but the most recent 4 folders
find -maxdepth 1 -not -path '.' -type d | sort -u | head -n -4 | xargs echo rm -r
popd
pushd /path/to/backup
rsync -z -v -r --links -e ssh cassava:/var/www/sparc sparc-$(date -I)
# export is big, so probably only keep two of these around (current and previous); copy current locally and then rsync into it again
rsync -z -v -r --links -e ssh cassava-sparc:/var/lib/sparc/.local/share/sparcur/export sparcur-export-$(date -I)
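A sketch of the comment above, reusing the previous export backup and then refreshing it; <previous-date> is a placeholder, and rsync is used for the local copy to stay consistent with the xattr warning at the top:
# seed today's copy from the most recent export backup
rsync -a sparcur-export-<previous-date>/ sparcur-export-$(date -I)/
# then refresh it from the server (same command as above)
rsync -z -v -r --links -e ssh cassava-sparc:/var/lib/sparc/.local/share/sparcur/export sparcur-export-$(date -I)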
pushd /path/to/backup
pushd sparc-*/sparc/archive/exports
find -maxdepth 1 -not -path '.' -type d -exec tar -cvJf '{}.tar.xz' '{}' \;
find -maxdepth 1 -not -path '.' -type d -exec rm -r '{}' \;
popd
pushd sparc-*/sparc/preview/archive/exports
find -maxdepth 1 -not -path '.' -type d -exec tar -cvJf '{}.tar.xz' '{}' \;
find -maxdepth 1 -not -path '.' -type d -exec rm -r '{}' \;
popd
function sparc-copy-pull () {
: ${SPARC_PARENT:=${HOME}/files/blackfynn_local/}
local TODAY=$(date +%Y%m%d)
pushd ${SPARC_PARENT} &&
mv SPARC\ Consortium "SPARC Consortium_${TODAY}" &&
rsync -ptgo -A -X -d --no-recursive --exclude=* "SPARC Consortium_${TODAY}/" SPARC\ Consortium &&
mkdir SPARC\ Consortium/.operations &&
mkdir SPARC\ Consortium/.operations/trash &&
rsync -X -u -v -r "SPARC Consortium_${TODAY}/.operations/objects" SPARC\ Consortium/.operations/ &&
pushd SPARC\ Consortium &&
spc pull || echo "spc pull failed"
popd
popd
}
jq -r '[ .datasets[] |
{id: .id,
name: .meta.folder_name,
se: [ .status.submission_errors[].message ] | unique,
ce: [ .status.curation_errors[].message ] | unique } ]' curation-export.json
Get a list of all file extensions.
find -type l -o -type f | grep -o '\(\.[a-zA-Z0-9]\+\)\+$' | sort -u
Arbitrary information about a dataset with files matching a pattern. The example here gives the ids for all datasets that contain xml files. Nesting find -exec does not work, so the first pattern here uses shell globbing to get the datasets.
function datasets-matching () {
for d in */; do
find "$d" \( -type l -o -type f \) -name "*.$1" \
-exec getfattr -n user.bf.id --only-values "$d" \; -printf '\n' -quit ;
done
}
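Usage, listing the dataset ids that contain xml files as described above:
datasets-matching xml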
Fetch files that have zero size (indication that fetch is broken).
find -type f -name '*.xml' -empty -exec spc fetch {} \+
find -type d -not -name 'ephys' -name 'ses-*' -exec bash -c \
'pushd "$1" 1>/dev/null; pwd >> ~/manifest-stuff.txt; spc report size --tab-table ./* >> ~/manifest-stuff.txt; popd 1>/dev/null' _ {} \;
This one is fairly slow, but is almost certainly i/o limited due to having to read the xattrs. Maintaining the backup database of the mappings would make this much faster.
# folders and files
find . -not -type l -not -path '*operations*' -exec getfattr -n user.bf.id --only-values {} \; -print
# broken symlink format, needs work, hard to parse
find . -type l -not -path '*operations*' -exec readlink -n {} \; -print
for d in */; do printf "$(find "${d}" -print | wc -l) "; printf "$(getfattr --only-values -n user.bf.id "${d}") ${d}\n" ; done | sort -n
Until we fix compound units parsing for the round trip we might accidentally encounter an error along the lines of ValueError: Unit expression cannot have a scaling factor.
jq -C '.. | .units? // empty' /tmp/curation-export-*.json | sort -u
pushd ~/.cache/idlib
mv protocol_json protocol_json-old
# run export
find protocol_json -size -2 -exec cat {} \+
# check to make sure that there weren't any manually provided caches
find protocol_json -size -2 -execdir cat ../protocol_json-old/{} \;
Run the following inside the SPARC Consortium folders whose contents you want to nuke.
find -maxdepth 1 -type d -not -name '.operations' -not -name '.' -exec rm -r {} \;
from sparcur.paths import Path
# pick one value for here: the current working directory or an explicit path
here = Path.cwd()
here = Path('/var/lib/sparc/files/sparc-datasets-test')
bs = [
rc
for c in here.children
for rd in (c / 'SPARC Consortium' / '.operations' / 'temp-upstream').rchildren_dirs
for rc in rd.children
if rc.is_broken_symlink()]
_ = [b.unlink() for b in bs]
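A shell equivalent of the cleanup above (a sketch; -xtype l matches broken symlinks, and the glob covers both SPARC and SPARC Consortium layouts):
# run from the sparc-datasets-test folder
# list broken symlinks under temp-upstream; append -delete to actually remove them
find */SPARC*/.operations/temp-upstream -xtype l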
We already unlink the broken symlinks after completing the swap when we pull a fresh copy. However, when there are lots of changes the folders themselves start to add up, so consider cleaning those out too; the main use case is to keep a record of the metadata files from a given date.
pushd ~/files/sparc-datasets-test
# list empty directories in temp-upstream; add -delete to remove them
find */SPARC/.operations/temp-upstream -type d -empty
This preserves the old .operations folders in SPARC Consortium for the time being.
pushd ~/files/sparc-datasets-test
# shows how much space can be recovered by removing old dataset folders
find */SPARC*/ -mindepth 1 -maxdepth 1 -type d -not -name '.operations' -not -exec sh -c 'test "$(readlink "${1}"/../../dataset)" = "${1#*/}"' _ {} \; -exec du -hd0 --total {} \+ | sort -h
# sh -c 'echo "${1#*/}"' _ {} \; # remove everything before the first slash
# actually delete, DO NOT PASTE THE OUTPUT TO RUN!!! there are spaces !!! remove the echo '#' bit
find */SPARC*/ -mindepth 1 -maxdepth 1 -type d -not -name '.operations' -not -exec sh -c 'test "$(readlink "${1}"/../../dataset)" = "${1#*/}"' _ {} \; -exec echo '#' rm -rf {} \;
Check to see whether objects already exist in SPARC/.operations/objects and, if not, move them there from SPARC Consortium/.operations/objects, leaving duplicates to be removed.
pushd ~/files/sparc-datasets-test
# list files that would be moved because they are not in SPARC/.objects
find */SPARC\ Consortium/.operations/objects -type f -not -exec bash -c 'target="${1/\ Consortium}"; test -f "${target}"' _ {} \; -print
# see the distribution of sizes for files that would be moved
find */SPARC\ Consortium/.operations/objects -type f -not -exec bash -c 'target="${1/\ Consortium}"; test -f "${target}"' _ {} \; -exec ls -alhS {} \+
# actually move the files, when running for real remove the echos AGAIN --- DO NOT PASTE
find */SPARC\ Consortium/.operations/objects -type f -exec bash -c 'target="${1/\ Consortium}"; test -f "${target}" || { echo mkdir -p "${target%/*}" && echo mv "${1}" "${target}"; }' _ {} \;
# internal consistency check (usually detects issues coming from upstream)
find */SPARC/.operations/objects -type f -exec spc meta --only-diff {} \+
# see if there are any cases where the files are not the same
find */SPARC\ Consortium/.operations/objects -type f -not -exec bash -c 'target="${1/\ Consortium}"; test -f "${target}" && { sha256sum "${1}" | sed "s/ Consortium//" | sha256sum --check --status; } ' _ {} \; -print
# delete files where the target exists (make sure all files are actually identical) change -print to -delete when ready to go for real
find */SPARC\ Consortium/.operations/objects -type f -exec bash -c 'target="${1/\ Consortium}"; test -f "${target}"' _ {} \; -print
pushd ~/files/sparc-datasets-test
# get modified dates for all consort variants
find -mindepth 3 -maxdepth 3 -path '*SPARC\ Consortium*' -not -name '.operations' -exec ls -alhtrd {} \+
# SPARC Consortium only cases
find -maxdepth 2 -type d -name 'SPARC*' | sort -u | grep Consort -B1 | grep -v -- '--' | sort | cut -d\/ -f2 | uniq -u | xargs -I[] find [] -mindepth 1 -maxdepth 1 -type d | grep Consort | cut -d\/ -f1
# no consort cases
find -maxdepth 2 -type d -name 'SPARC*' | sort -u | grep Consort -B1 | grep -v -- '--' | sort | cut -d\/ -f2 | uniq -u | xargs -I[] find [] -mindepth 1 -maxdepth 1 -type d | grep -v Consort | cut -d\/ -f1
# only both SPARC and SPARC Consortium cases
find -mindepth 2 -maxdepth 2 -type d -name 'SPARC' -exec test -d {}/../SPARC\ Consortium \; -exec ls {}/.. \;
# get sizes of the consort folders
find -mindepth 2 -maxdepth 2 -type d -name 'SPARC' -exec test -d {}/../SPARC\ Consortium \; -exec du -hd0 {}\ Consortium \; | sort -h
find */SPARC\ Consortium/.operations/temp-upstream/ -type d -name '*-ERROR' | cut -d'/' -f 1 | sort -u
python -m sparcur.simple.retrieve --jobs 1 --sparse-limit -1 --parent-parent-path . --dataset-id $1
pushd $1
spc export
pushd ~/.cache/log/sparcur/datasets
find -name stdout.log -printf "%T@ %Tc %p\n" | sort -n
less -R $_some_path
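A sketch combining the two steps to open only the most recent log directly (assumes the log paths contain no spaces):
# run from ~/.cache/log/sparcur/datasets
less -R $(find -name stdout.log -printf "%T@ %p\n" | sort -n | tail -n 1 | cut -d' ' -f2)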
from sparcur.datasets import Tabular
from sparcur.paths import Path
p = Path('dataset_description.xlsx')
t = Tabular(p)
hrm1 = list(t.xlsx1())
hrm2 = list(t.xlsx2())
Look for \ufeff (the BOM) at the start of strings and then use e.g. vim to open and edit the file, removing it from the offending strings.
Run the function, paste in the ids listed under failed, and hit enter.
function review-failed () {
local paths _id
paths=()
while read _id; do
if [ -z "${_id}" ]; then break; fi
paths+=(~/.cache/log/sparcur/datasets/${_id}/LATEST/stdout.log)
done
less -R ${paths[@]}
}
Get the ids from curl instead of pasting them.
function review-failed-curl () {
local paths _id
paths=()
for _id in "${@}" ; do
paths+=(~/.cache/log/sparcur/datasets/${_id}/LATEST/stdout.log)
done
less -R ${paths[@]}
}
Usage.
review-failed-curl $(curl https://cassava.ucsd.edu/sparc/pipelines/failed | jq -r '.failed[]' | sed 's/N:dataset://')
find -type f -exec sh -c '[ "$(getfattr -d "$1")" = "" ] || exit 1' _ {} \; -exec python -m sparcur.cli meta --fake --meta-from-local {} \+
See the developer guide section on SCKAN.
Have to clone SODA and fetch the files for testing.
from pprint import pprint
import pysoda
from sparcur.paths import Path
# parent_folder and path should point at the cloned SODA test files (set them before running)
p = Path(parent_folder, path).expanduser().resolve()
children = list(p.iterdir())
blob = pysoda.create_folder_level_manifest(
{p.resolve().name: children},
{k.name + '_description': ['some description'] * len(children)
for k in [p] + list(p.iterdir())})
manifest_path = Path(blob[p.name][-1])
manifest_path.xopen()
pprint(manifest_path)
See also the sparcur developer guide.
Clean up existing files.
pushd ~/git/sparc-curation/resources
pypy3 -m sparcur.simple.clean_metadata_files --for-template clean --cleaned-output-path dt_clean DatasetTemplate
cp dt_clean/*.xlsx DatasetTemplate/
Commit any changes and push to master.
Generate diffs against the previous tag and then view them with less.
pushd ~/git/CLEANROOM/sparc-curation/resources/DatasetTemplate
[ -d ../csvs ] || mkdir ../csvs
[ -d ../diffs ] || mkdir ../diffs
lasttag=$(git tag --sort=taggerdate --list dataset-template* | tail -n1)
dtver="${lasttag##*-}"
for f in *.xlsx; do
git show "${lasttag}":resources/DatasetTemplate/"${f}" | xlsx2csv - ../csvs/"${f%%.*}-${dtver}.csv"
xlsx2csv "${f}" ../csvs/"${f%%.*}.csv"
git diff --word-diff --word-diff-regex=. --no-index --color=always -- ../csvs/"${f%%.*}-${dtver}.csv" ../csvs/"${f%%.*}.csv" > ../diffs/"${f%%.*}.patch"
done
# less ../diffs/*
popd
make-template-zip () {
template_type="${1}"
local CLEANROOM=/tmp/cleanroom/
mkdir ${CLEANROOM} || return 1
pushd ${CLEANROOM}
git clone https://github.com/SciCrunch/sparc-curation.git &&
pushd ${CLEANROOM}/sparc-curation/resources
# TODO path to spec file for various templates (see sparcur.simple.clean_metadata_files and datasets.Tabular._openpyxl_fixes)
python -m sparcur.simple.clean_metadata_files --for-template ${template_type} --cleaned-output-path "dt_${template_type}" DatasetTemplate
cp "dt_${template_type}"/*.xlsx DatasetTemplate/
zip -x '*.gitkeep' -x '*/curation.xlsx' -x '*/aux*' -r DatasetTemplate.zip DatasetTemplate
mv DatasetTemplate.zip ${CLEANROOM}
popd
rm -rf ${CLEANROOM}/sparc-curation
popd
}
make-template-zip default
Once that is done, open /tmp/cleanroom/DatasetTemplate.zip in file-roller or similar and make sure everything is as expected.
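If a GUI archive viewer is not handy, a quick sanity check from the shell (a sketch):
# list the archive contents to confirm the expected xlsx files are present
unzip -l /tmp/cleanroom/DatasetTemplate.zip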
Create the GitHub release. The tag name should have the format dataset-template-1.1 where the version number should match the metadata version embedded in dataset_description.xlsx. Minor versions such as dataset-template-1.2.1 are allowed. Attach ${CLEANROOM}/DatasetTemplate.zip as a release asset.
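Alternatively, the GitHub CLI can create the release and attach the asset in one step (a sketch; assumes gh is installed and authenticated, and the tag name is a placeholder):
# create the release from the tag and upload the zip as an asset
gh release create dataset-template-X.Y.Z /tmp/cleanroom/DatasetTemplate.zip \
  --title "dataset-template-X.Y.Z" --notes "DatasetTemplate release"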
Inform curation so they can notify the community.
Use inspect.getclasstree along with pyontutils.utils.subclasses to display hierarchies of classes.
from inspect import getclasstree
from pyontutils.utils import subclasses
from IPython.lib.pretty import pprint
# classes to inspect
import pathlib
from sparcur import paths
def class_tree(root):
return getclasstree(list(subclasses(root)))
pprint(class_tree(pathlib.PurePosixPath))
View the most recent export log with less.
less -R $(ls -d ~sparc/files/blackfynn_local/export/log/* | tail -n 1)
For a permanent fix for less, add alias less='less -R'.
maybe_size = c.cache.meta.size # << AttributeError here
Modify to wrap the code:
try:
maybe_size = c.cache.meta.size
except AttributeError as e:
breakpoint() # << investigate error
Temporarily squash it by logging the exception, with an optional explanation:
try:
maybe_size = c.cache.meta.size
except AttributeError as e:
log.exception(e)
log.error(f'explanation for error and local variables {c}')