Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add fixture duplication script #140

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
174 changes: 174 additions & 0 deletions scripts/duplicate_fixtures.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to use for creating a modified, non-conflicting duplicate of an existing database fixture
Use this after you have dumped some fixtures to a .json file using `dump_db_fixtures.py`, and now you
want another fixture set that has different UUIDs and identifiers so you can load both at once in a dev database
in order to get work done with them.

Input: .json file produced by dump_db_fixtures.py

Output: a new .json file with updated key values

Usage
-----
./duplicate_fixtures.py ca18b090-03ad-4bef-acd3-52600f8e62eb.run.full.json

Notes
-----
The input .json file must have been produced with 'indent = 4' or have equivalent indenting, or this script will probably not work
"""
import sys
import json
import uuid
import copy
import re
import time
import argparse

def get_unique_id():
    """
    Generate a fresh unique identifier.

    uuid1 is used rather than uuid4 because it incorporates the host MAC
    address and a timestamp, giving a lower chance of duplicate ids being
    generated.

    TODO: see what Sinisa thinks about this later

    Returns
    -------
    uuid.UUID
        A new version-1 UUID object.
    """
    return uuid.uuid1()

def replace_primary_keys(input_file):
    """
    Replace every primary key in the fixtures file with a freshly generated UUID.

    Primary keys need special handling because they do not always have a field
    label in the file, so each key is replaced by plain text substitution on
    every line where it occurs.

    Parameters
    ----------
    input_file : str
        Path to a .json fixtures file produced by ``dump_db_fixtures.py``.
        The file must have been written with ``indent = 4`` (or equivalent)
        so that each primary key sits on its own line.

    Returns
    -------
    list
        The lines of the file with all primary keys replaced.

    Raises
    ------
    ValueError
        If any line would be edited more than once; that indicates the
        one-key-per-line assumption does not hold for this file.
        (The original code used a bare ``raise`` here with no active
        exception, which would have surfaced as an unrelated RuntimeError.)
    """
    # collect all primary keys that need to be changed
    all_pk = set()
    with open(input_file) as f:
        for item in json.load(f):
            # do not change the pk's on 'File' objects because that causes
            # unique constraint issues on file path
            if item['model'] != 'file_system.file':
                all_pk.add(item['pk'])

    # sort unique keys on descending length; this makes sure we replace the
    # longer pattern first, so a pk that is a substring of another pk cannot
    # clobber it in the next steps
    all_pk = sorted(all_pk, key=len, reverse=True)

    # load all lines of text from the file (with a context manager so the
    # handle is not leaked)
    with open(input_file) as f:
        lines = f.readlines()

    # make a copy for editing
    output_lines = copy.deepcopy(lines)

    # set for O(1) membership checks on already-edited line indexes
    edited_lines = set()

    # search for every primary key in every line of text ... !! Yes, we are doing this
    for old_pk in all_pk:
        new_pk = str(uuid.uuid1())
        for i, old_line in enumerate(lines):
            if old_pk in old_line:
                # plain string replacement of every occurrence; the previous
                # re.sub call would have misbehaved if a pk ever contained
                # regex metacharacters
                output_lines[i] = output_lines[i].replace(old_pk, new_pk)

                # each line should only get edited one time; with indent = 4
                # JSON every pk sits on its own line, so a second edit means
                # something is wrong
                if i in edited_lines:
                    raise ValueError(
                        "line {} is about to get edited twice; that is not supposed to happen".format(i)
                    )
                edited_lines.add(i)
    return output_lines

def get_field_values(input_lines, field_name):
    """
    Collect every unique value of a given field across the file lines.

    examples:

        "runId": "PITT_0390",
        >>> ["PITT_0390"]

        "requestId": "09670_D_1581808018",
        "sampleId": "09670_D_1",
        "patientId": "C-K2902H",
        "sampleName": "C-K2902H-P001-d",
        "externalSampleId": "S16-68609",
        "investigatorSampleId": "S16-68609",

    Parameters
    ----------
    input_lines : list
        Lines of text from the fixtures file.
    field_name : str
        Name of the JSON field whose values should be collected.

    Returns
    -------
    list
        Unique values, reverse sorted on length so sub-patterns do not get
        replaced first later.
    """
    # compile once; the pattern is reused against every line
    pattern = re.compile('.*"{field_name}": "(.*)"'.format(field_name = field_name))
    found = set()
    for text_line in input_lines:
        hit = pattern.search(text_line)
        if hit is not None:
            found.add(hit.group(1))
    return sorted(found, key = len, reverse = True)

def replace_field_value(input_lines, field_name, old_value, new_value):
    """
    Replace the old value for a field with the new value in all file lines.

    Only lines that carry the field label (e.g. ``"runId":``) are touched,
    so the same value appearing under a different field is left alone.

    Parameters
    ----------
    input_lines : list
        Lines of text from the fixtures file.
    field_name : str
        Name of the JSON field whose value should be changed.
    old_value : str
        Literal value to replace.
    new_value : str
        Replacement value.

    Returns
    -------
    list
        A new list of lines with the replacements applied; ``input_lines``
        is not modified.
    """
    # make a copy for editing
    output_lines = copy.deepcopy(input_lines)

    # literal field label to look for, e.g. '"runId":'
    field_label = '"{field_name}":'.format(field_name = field_name)

    # search for the field label in all lines and replace the value if found
    for i, line in enumerate(input_lines):
        # check that it's a line with the desired field name in it, and that
        # the desired value to be changed is present.
        # NOTE: plain substring tests and str.replace are used on purpose;
        # the previous re.search/re.sub calls treated old_value as a regex
        # pattern, which matched and replaced incorrectly whenever the value
        # contained metacharacters such as '.', '+' or '('.
        if field_label in line and old_value in line:
            output_lines[i] = output_lines[i].replace(old_value, new_value)
    return output_lines

def main(**kwargs):
    """
    Main function for editing a fixtures file to replace all old primary keys
    and identifier fields with new values, so that both the old and the new
    fixture sets can be loaded into the database at the same time.

    Keyword Arguments
    -----------------
    input_file : str
        Path to the .json fixtures file to duplicate (required).
    output_file : str, optional
        Path to write the duplicated fixtures to; defaults to
        ``<input_file>.duplicated.json``.
    """
    input_file = kwargs.pop('input_file')
    output_file = kwargs.pop('output_file', None)

    # use identity comparison against None per PEP 8
    if output_file is None:
        output_file_name = "{}.duplicated.json".format(input_file)
    else:
        output_file_name = output_file

    # generate a timestamp string to use for new unique identifiers
    timestamp_str = str(int(time.time()))

    # replace all the primary keys with new values; need special handling for
    # pk's because they do not always have a field label in the file
    output_lines = replace_primary_keys(input_file)

    # replace the values for all of these other desired fields; these fields
    # are always clearly labeled in the file so they are easy to find
    for field_name in ['runId', 'requestId', 'sampleId', 'patientId', 'sampleName', 'externalSampleId', 'investigatorSampleId']:
        all_values = get_field_values(input_lines = output_lines, field_name = field_name)
        for old_value in all_values:
            # make a new value by appending the timestamp
            new_value = old_value + '_' + timestamp_str
            output_lines = replace_field_value(input_lines = output_lines, field_name = field_name, old_value = old_value, new_value = new_value)

    # save the output file
    with open(output_file_name, "w") as fout:
        fout.writelines(output_lines)

def parse():
    """
    Parse the command line arguments and hand them off to main().

    Script arg parsing will go here as this script grows.
    """
    parser = argparse.ArgumentParser(
        description = 'Duplicate database fixtures that were previously dumped in json format with indentation')
    parser.add_argument('input_file', help = "Input file containing fixtures")
    parser.add_argument('--output-file', dest = "output_file", default = None, help = "Name of output file to write to")
    # convert the parsed namespace to a dict and dispatch
    main(**vars(parser.parse_args()))

if __name__ == '__main__':
    parse()