Skip to content

Commit

Permalink
hapmerge converts alleles to uppercase allowing for easy and correct …
Browse files Browse the repository at this point in the history
…merging
  • Loading branch information
mnshgl0110 committed Jul 14, 2022
1 parent a99693a commit 2320275
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 20 deletions.
38 changes: 21 additions & 17 deletions syri/scripts/regAnno
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,29 @@ def readData(args, cwdPath):
except Exception as e:
print("ERROR: while trying to read ", fin, "Out.txt", e)
continue

annoData = finD.loc[finD[0] == "#"].copy().drop( [0,4],axis = 1).dropna(axis = 1 , how="all")
annoData = finD.loc[finD[0] == "#"].copy().drop([0, 4], axis=1).dropna(axis=1, how="all")
annoData["state"] = fin
annoData = annoData.reindex(columns = ["state"]+annoData.columns[:-1].tolist())
annoCoords = annoCoords.append(annoData.copy(), sort = True)
annoCoords[[2,3,6,7]] = annoCoords[[2,3,6,7]].astype("int")
colNames = ["SVtype","refChr","refStart","refEnd","qryChr","qryStart","qryEnd","ctxType","Genome"]
annoCoords.columns = colNames[:annoCoords.shape[1]]
annoCoords.sort_values(["refChr","refStart","refEnd","qryChr","qryStart","qryEnd"],inplace=True)
annoData = annoData.reindex(columns=["state"]+annoData.columns[:-1].tolist())
# annoCoords = annoCoords.append(annoData.copy(), sort=True)
annoCoords = pd.concat([annoCoords, annoData], sort=True)
annoCoords[[2, 3, 6, 7]] = annoCoords[[2, 3, 6, 7]].astype("int")
colnames = ["SVtype", "refChr", "refStart", "refEnd", "qryChr", "qryStart", "qryEnd", "ctxType", "Genome"]
annoCoords.columns = colnames[:annoCoords.shape[1]]
annoCoords.sort_values(["refChr", "refStart", "refEnd", "qryChr", "qryStart", "qryEnd"], inplace=True)
annoCoords.index = range(len(annoCoords))
return annoCoords
# END


def getRegion(args, cwdPath):
data = readData(args,cwdPath)
data = readData(args, cwdPath)
if not args.q:
regions = data.loc[(data.refChr==args.chr) & (data.refStart <= args.end) & (data.refEnd >= args.start)].copy()
regions = data.loc[(data.refChr == args.chr) & (data.refStart <= args.end) & (data.refEnd >= args.start)].copy()
elif args.q:
regions = data.loc[(data.qryChr==args.chr) & (data.qryStart <= args.end) & (data.qryEnd >= args.start)].copy()
regions.sort_values(["qryChr","qryStart","qryEnd","refChr","refStart","refEnd"],inplace=True )
print(regions.to_csv(sep="\t",index=False, header=False))
regions = data.loc[(data.qryChr == args.chr) & (data.qryStart <= args.end) & (data.qryEnd >= args.start)].copy()
regions.sort_values(["qryChr", "qryStart", "qryEnd", "refChr", "refStart", "refEnd"], inplace=True )
print(regions.to_csv(sep="\t", index=False, header=False))
# END


def getBed(args, cwdPath):
Expand All @@ -61,6 +64,7 @@ def getBed(args, cwdPath):
if args.a:
outD[["SVType","ctxType","genome"]] = data[["SVtype","ctxType","Genome"]]
print(outD.to_csv(sep="\t", index=False, header=False))
# END


def getSnpAnno(args, cwdPath):
Expand Down Expand Up @@ -95,9 +99,8 @@ def getSnpAnno(args, cwdPath):
outData["inChr"] = list(chroms)
outData["inPos"] = list(poses)
print(outData.to_csv(sep="\t", index=False,header=False), end = "")



# END


if __name__ == "__main__":
parser = argparse.ArgumentParser()
Expand All @@ -115,6 +118,7 @@ if __name__ == "__main__":
parser_region.add_argument("chr", help="Chromosome ID", type=str)
parser_region.add_argument("start", help="region start", type=int)
parser_region.add_argument("end", help="region end", type=int)
parser_region.add_argument("-p", help="prefix", type=str)
parser_region.add_argument("-q", help="search region in query genome", action="store_true", default=False)
parser_region.add_argument("-f", help="files to search", type=argparse.FileType('r'), nargs="+", default=None)

Expand All @@ -134,7 +138,7 @@ if __name__ == "__main__":
args = parser.parse_args()
if "f" in args:
if args.f == None:
args.f = list(map(open,['synOut.txt', 'invOut.txt', 'TLOut.txt', 'invTLOut.txt', 'dupOut.txt', 'invDupOut.txt', 'ctxOut.txt']))
args.f = list(map(open, ['synOut.txt', 'invOut.txt', 'TLOut.txt', 'invTLOut.txt', 'dupOut.txt', 'invDupOut.txt', 'ctxOut.txt']))

import pandas as pd
from collections import deque
Expand Down
10 changes: 7 additions & 3 deletions syri/scripts/vcfasm.py → syri/scripts/vcfasm
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def readvcf(v):
if var.chrom != lastchr:
if var.chrom not in cids:
cids.append(var.chrom)
vd[var.chrom].append((var.pos, var.ref, var.alts))
vd[var.chrom].append((var.pos, var.ref.upper(), tuple(map(str.upper, var.alts)))) # Convert bases to upper case to allow correct comparison
gp = deque() # garb pos
v1d = defaultdict(dict)
for k in vd:
Expand Down Expand Up @@ -238,7 +238,11 @@ def filindsize(args):
for var in v1.fetch():
if len(var.ref) == 1 and len(var.alts) == 1 and len(var.alts[0]) == 1:
vo.write(str(var))
elif imin <= abs(len(var.ref) - max([len(a) for a in var.alts])) <= imax:
elif not imin <= len(var.ref) <= imax:
continue
elif any([not imin <= len(a) <= imax for a in var.alts]):
continue
else:
vo.write(str(var))
logger.info(f'Finished filindsize SNPs/indels. Output saved in {outvcf}')
return 0
Expand All @@ -252,7 +256,7 @@ def filindsize(args):
# Define subparsers
parser_vcfshv = subparsers.add_parser("vcfshv", help="Filter VCF file from syri to select SNPs and indels", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_hapmerge = subparsers.add_parser("hapmerge", help="Merge VCF files corresponding two variantions in the haploid assemblies of a diploid organism. VCF should only have SNPs and indels before merging.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_filindsize = subparsers.add_parser("filindsize", help="Filter indels in a VCF file based on size. Currently, can only handle VCF having only SNPs and indels.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser_filindsize = subparsers.add_parser("filindsize", help="Filter indels in a VCF file based on size of ref/alt allele. Currently, can only handle VCF having only SNPs and indels.", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# Check whether any command is selected ####################################
if len(sys.argv[1:]) == 0:
parser.print_help()
Expand Down

0 comments on commit 2320275

Please sign in to comment.