From 54d9a15096414354d2f548d58bccbd9d8872f538 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 5 Aug 2024 11:02:36 -0700 Subject: [PATCH 01/27] adding draft script to fix reference --- scripts/fix-ref/README.md | 30 +++++++ .../fix_ensembletr_snpstr_reference.py | 79 +++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 scripts/fix-ref/README.md create mode 100755 scripts/fix-ref/fix_ensembletr_snpstr_reference.py diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md new file mode 100644 index 0000000..1dc7e2a --- /dev/null +++ b/scripts/fix-ref/README.md @@ -0,0 +1,30 @@ +# Clean up the EnsembleTR SNP-TR reference panel + +TODO: +* Remove duplicate records and those with incorrect reference allele +* Add required header line +* Add informative variant IDs to STRs +* Run script to add beagle info +* Release bref versions alongside the VCFs + +## Step 1: Fix reference VCF files + +``` +chrom=11 +./fix_ensembletr_snpstr_reference.py \ + --vcf chr${chrom}_final_SNP_merged_additional_TRs.vcf.gz \ + --ref Homo_sapiens_assembly38.fasta \ + --out ensembletr_refpanel_v3_chr${chrom} +``` + +## Step 2: Add beagle info + +``` +TODO +``` + +## Step 3: Convert to bref + +``` +TODO +``` \ No newline at end of file diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py new file mode 100755 index 0000000..0259ac9 --- /dev/null +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +""" +Script to clean up SNP/TR reference haplotype panels +""" + +import argparse +import cyvcf2 +import pyfaidx +import sys + +def IsTRRecord(record_id): + return record_id is None or record_id.strip() == "." 
+ +def GetTRRecordID(record): + return "EnsTR:%s:%s"%(record.CHROM, record.POS) + +def CheckReference(record, refgenome): + # REF in VCF should match what is in the reference genome + refseq = refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)] + return (refseq == record.REF) + +def run(): + args = getargs() + if args == None: + sys.exit(1) + else: + retcode = main(args) + sys.exit(retcode) + +def getargs(): # pragma: no cover + parser = argparse.ArgumentParser(__doc__) + parser.add_argument("--vcf", help="Input SNP/TR VCF file", type=str, required=True) + parser.add_argument("--ref", help="Reference fasta file", type=str, required=True) + parser.add_argument("--out", help="Prefix for output VCF file", type=str, required=True) + parser.add_argument("--max-records", help="Quit after processing this many records (for debug)", \ + default=-1, type=int) + args = parser.parse_args() + return args + +def main(args): + # Set up VCF reader + reader = cyvcf2.Reader(args.vcf) + + # Set up the reference + refgenome = pyfaidx.Fasta(args.ref) + + # Set up writer, adding the missing header + reader.add_to_header("##command=hipstr;note this is a dummy header line") + writer = cyvcf2.Writer(args.out + ".vcf", reader) + + # Go through each record + # If SNP: just print it + # if STR: filter records with incorrect reference, + # and modify ID + num_records_processed = 0 + for record in reader: + num_records_processed += 1 + if args.max_records > 0 and num_records_processed > args.max_records: + break # for debug + if not IsTRRecord(record.ID): + writer.write_record(record) + else: + # Check reference + if not CheckReference(record, refgenome): + sys.stderr.write("Skipping record %s:%s:%s, bad ref sequence\n"%(record.ID,record.CHROM,record.POS)) + sys.stderr.write(" REF=%s\n"%record.REF) + sys.stderr.write(" Refseq=%s\n"%refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)]) + continue # skip this record + # Modify ID + record.ID = GetTRRecordID(record) + 
# Write to file + writer.write_record(record) + + reader.close() + writer.close() + +if __name__ == "__main__": + run() \ No newline at end of file From 386fb7cb0238791c75450803b3204fbd31404677 Mon Sep 17 00:00:00 2001 From: nichole Date: Tue, 6 Aug 2024 10:24:59 -0700 Subject: [PATCH 02/27] add convert to bref3 --- scripts/fix-ref/README.md | 8 +--- scripts/fix-ref/convert_to_bref3.sh | 28 +++++++++++++ .../fix_ensembletr_snpstr_reference.py | 40 ++++++++++++------- 3 files changed, 55 insertions(+), 21 deletions(-) create mode 100644 scripts/fix-ref/convert_to_bref3.sh diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 1dc7e2a..2be9aa0 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -17,13 +17,7 @@ chrom=11 --out ensembletr_refpanel_v3_chr${chrom} ``` -## Step 2: Add beagle info - -``` -TODO -``` - -## Step 3: Convert to bref +## Step 2: Convert to bref ``` TODO diff --git a/scripts/fix-ref/convert_to_bref3.sh b/scripts/fix-ref/convert_to_bref3.sh new file mode 100644 index 0000000..e6fa514 --- /dev/null +++ b/scripts/fix-ref/convert_to_bref3.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +ref=$1 +bref="bref3.27May24.118.jar" + + + +#bgzip and index ref +echo "bgzip and index ref" +bgzip ${ref}.vcf +tabix -p vcf ${ref}.vcf.gz + +#dowaload bref3.jar +# Check if the file exists locally +if [ ! -f "$bref" ]; then + echo "File does not exist. Downloading..." + wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar +else + echo "File already exists. Continuing..." 
+fi + +#convert vcf to bref3 +echo "converting vcf to bref3 format" +#zcat ${ref}.vcf.gz | java -jar $bref > ${ref}.bref3 +zcat ${ref}.vcf.gz +echo "Done converting" + + \ No newline at end of file diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 0259ac9..f4b9790 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python """ Script to clean up SNP/TR reference haplotype panels @@ -8,6 +8,9 @@ import cyvcf2 import pyfaidx import sys +import subprocess +import os + def IsTRRecord(record_id): return record_id is None or record_id.strip() == "." @@ -54,26 +57,35 @@ def main(args): # if STR: filter records with incorrect reference, # and modify ID num_records_processed = 0 + #fail_positions = {201312,225414,18455865} for record in reader: - num_records_processed += 1 - if args.max_records > 0 and num_records_processed > args.max_records: - break # for debug - if not IsTRRecord(record.ID): - writer.write_record(record) - else: + num_records_processed += 1 + if args.max_records > 0 and num_records_processed > args.max_records: + break # for debug + if not IsTRRecord(record.ID): + writer.write_record(record) + else: # Check reference - if not CheckReference(record, refgenome): - sys.stderr.write("Skipping record %s:%s:%s, bad ref sequence\n"%(record.ID,record.CHROM,record.POS)) - sys.stderr.write(" REF=%s\n"%record.REF) - sys.stderr.write(" Refseq=%s\n"%refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)]) - continue # skip this record + if not CheckReference(record, refgenome): + #if record.POS in fail_positions: + #print('bcftools merge fail detected %s'%record.POS) + #sys.stderr.write("Skipping record %s:%s:%s, bad ref sequence\n"%(record.ID,record.CHROM,record.POS)) + #sys.stderr.write(" REF=%s\n"%record.REF) + #sys.stderr.write(" 
Refseq=%s\n"%refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)]) + continue # skip this record # Modify ID - record.ID = GetTRRecordID(record) + record.ID = GetTRRecordID(record) # Write to file - writer.write_record(record) + writer.write_record(record) reader.close() writer.close() + + # Run bash script to convert to bref3 + cmd = "bash convert_to_bref3.sh {filename}".format(filename=args.out) + output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout.read() + print(output.decode("utf-8")) + if __name__ == "__main__": run() \ No newline at end of file From 08d52a2ac8aff77ee45f8c46021e365d5b5ffe5c Mon Sep 17 00:00:00 2001 From: nichole Date: Tue, 6 Aug 2024 10:51:59 -0700 Subject: [PATCH 03/27] add script to download hg38 ref panel --- scripts/fix-ref/download.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 scripts/fix-ref/download.sh diff --git a/scripts/fix-ref/download.sh b/scripts/fix-ref/download.sh new file mode 100644 index 0000000..f9ed154 --- /dev/null +++ b/scripts/fix-ref/download.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +#download and index hg38 ref panel +wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta +samtools faidx Homo_sapiens_assembly38.fasta + +#download bref3.jar file +wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar \ No newline at end of file From f963e562113f07ea57430c572c62fc8b9ed81317 Mon Sep 17 00:00:00 2001 From: nichole Date: Tue, 6 Aug 2024 12:01:39 -0700 Subject: [PATCH 04/27] fix bug --- scripts/fix-ref/convert_to_bref3.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/fix-ref/convert_to_bref3.sh b/scripts/fix-ref/convert_to_bref3.sh index e6fa514..0826682 100644 --- a/scripts/fix-ref/convert_to_bref3.sh +++ b/scripts/fix-ref/convert_to_bref3.sh @@ -21,8 +21,7 @@ fi #convert vcf to bref3 echo "converting vcf to bref3 format" -#zcat ${ref}.vcf.gz | java -jar $bref > 
${ref}.bref3 -zcat ${ref}.vcf.gz +zcat ${ref}.vcf.gz | java -jar $bref > ${ref}.bref3 echo "Done converting" \ No newline at end of file From 2e93e6420d249808288bc840602f45d1ba928a4a Mon Sep 17 00:00:00 2001 From: nichole Date: Thu, 8 Aug 2024 10:20:05 -0700 Subject: [PATCH 05/27] add INFO field VT=OTHER/TR --- scripts/fix-ref/convert_to_bref3.sh | 2 +- scripts/fix-ref/fix_ensembletr_snpstr_reference.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/fix-ref/convert_to_bref3.sh b/scripts/fix-ref/convert_to_bref3.sh index 0826682..63e0d96 100644 --- a/scripts/fix-ref/convert_to_bref3.sh +++ b/scripts/fix-ref/convert_to_bref3.sh @@ -21,7 +21,7 @@ fi #convert vcf to bref3 echo "converting vcf to bref3 format" -zcat ${ref}.vcf.gz | java -jar $bref > ${ref}.bref3 +zcat < ${ref}.vcf.gz | java -jar $bref > ${ref}.bref3 echo "Done converting" \ No newline at end of file diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index f4b9790..017bc74 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -49,7 +49,8 @@ def main(args): refgenome = pyfaidx.Fasta(args.ref) # Set up writer, adding the missing header - reader.add_to_header("##command=hipstr;note this is a dummy header line") + reader.add_to_header('##command=hipstr;note this is a dummy header line') + reader.add_to_header('##INFO=') writer = cyvcf2.Writer(args.out + ".vcf", reader) # Go through each record @@ -63,6 +64,7 @@ def main(args): if args.max_records > 0 and num_records_processed > args.max_records: break # for debug if not IsTRRecord(record.ID): + record.INFO["VT"] = "OTHER" writer.write_record(record) else: # Check reference @@ -75,6 +77,7 @@ def main(args): continue # skip this record # Modify ID record.ID = GetTRRecordID(record) + record.INFO["VT"] = "TR" # Write to file writer.write_record(record) From 
f93bab68dafb38bba4c95e9ca14a196da00acdb9 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Wed, 14 Aug 2024 12:30:26 -0700 Subject: [PATCH 06/27] updating how we get locus IDs to accommodate duplicates --- scripts/fix-ref/README.md | 21 +++---- .../fix_ensembletr_snpstr_reference.py | 58 +++++++++++-------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 1dc7e2a..e9cdfc3 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -1,15 +1,14 @@ # Clean up the EnsembleTR SNP-TR reference panel -TODO: -* Remove duplicate records and those with incorrect reference allele -* Add required header line -* Add informative variant IDs to STRs -* Run script to add beagle info -* Release bref versions alongside the VCFs - ## Step 1: Fix reference VCF files +This step: +* Removes records with incorrect reference allele +* Adds required header line for TRTools +* Adds informative variant IDs to STRs + ``` +# Test on chr11 chrom=11 ./fix_ensembletr_snpstr_reference.py \ --vcf chr${chrom}_final_SNP_merged_additional_TRs.vcf.gz \ @@ -17,13 +16,7 @@ chrom=11 --out ensembletr_refpanel_v3_chr${chrom} ``` -## Step 2: Add beagle info - -``` -TODO -``` - -## Step 3: Convert to bref +## Step 2: Convert to bref ``` TODO diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 0259ac9..8723d9e 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -9,16 +9,26 @@ import pyfaidx import sys +MAX_ALLOWED_DUPS = 1000 + def IsTRRecord(record_id): - return record_id is None or record_id.strip() == "." + return record_id is None or record_id.strip() == "." 
-def GetTRRecordID(record): - return "EnsTR:%s:%s"%(record.CHROM, record.POS) +def GetTRRecordID(record, allids): + locid = "EnsTR:%s:%s"%(record.CHROM, record.POS) + if locid not in allids: + return locid + for i in range(1, MAX_ALLOWED_DUPS): + newlocid = "%s-%s"%(locid, i) + if newlocid not in allids: + sys.stderr.write("Adding duplicate locus %s\n"%newlocid) + return newlocid + raise ValueError("Error: too many duplicates of %s"%locid) def CheckReference(record, refgenome): - # REF in VCF should match what is in the reference genome - refseq = refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)] - return (refseq == record.REF) + # REF in VCF should match what is in the reference genome + refseq = refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)] + return (refseq == record.REF) def run(): args = getargs() @@ -34,7 +44,7 @@ def getargs(): # pragma: no cover parser.add_argument("--ref", help="Reference fasta file", type=str, required=True) parser.add_argument("--out", help="Prefix for output VCF file", type=str, required=True) parser.add_argument("--max-records", help="Quit after processing this many records (for debug)", \ - default=-1, type=int) + default=-1, type=int) args = parser.parse_args() return args @@ -53,24 +63,26 @@ def main(args): # If SNP: just print it # if STR: filter records with incorrect reference, # and modify ID + allids = set() num_records_processed = 0 for record in reader: - num_records_processed += 1 - if args.max_records > 0 and num_records_processed > args.max_records: - break # for debug - if not IsTRRecord(record.ID): - writer.write_record(record) - else: - # Check reference - if not CheckReference(record, refgenome): - sys.stderr.write("Skipping record %s:%s:%s, bad ref sequence\n"%(record.ID,record.CHROM,record.POS)) - sys.stderr.write(" REF=%s\n"%record.REF) - sys.stderr.write(" Refseq=%s\n"%refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)]) - continue # skip this record - # Modify ID 
- record.ID = GetTRRecordID(record) - # Write to file - writer.write_record(record) + num_records_processed += 1 + if args.max_records > 0 and num_records_processed > args.max_records: + break # for debug + if not IsTRRecord(record.ID): + writer.write_record(record) + else: + # Check reference + if not CheckReference(record, refgenome): + sys.stderr.write("Skipping record %s:%s:%s, bad ref sequence\n"%(record.ID,record.CHROM,record.POS)) + sys.stderr.write(" REF=%s\n"%record.REF) + sys.stderr.write(" Refseq=%s\n"%refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)]) + continue # skip this record + # Modify ID + record.ID = GetTRRecordID(record, allids) + allids.add(record.ID) + # Write to file + writer.write_record(record) reader.close() writer.close() From fd3daa6ba1abedb284bc5e182b166701fcb3066c Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 30 Aug 2024 21:21:35 -0700 Subject: [PATCH 07/27] checks to remove loci with too many or too few alleles --- scripts/fix-ref/README.md | 3 ++- scripts/fix-ref/fix_ensembletr_snpstr_reference.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index e9cdfc3..a1d8713 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -13,7 +13,8 @@ chrom=11 ./fix_ensembletr_snpstr_reference.py \ --vcf chr${chrom}_final_SNP_merged_additional_TRs.vcf.gz \ --ref Homo_sapiens_assembly38.fasta \ - --out ensembletr_refpanel_v3_chr${chrom} + --out ensembletr_refpanel_v3_chr${chrom} \ + --max-alleles 50 --min-alleles 2 ``` ## Step 2: Convert to bref diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 43432c5..86212ac 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -46,6 +46,8 @@ def getargs(): # pragma: no cover parser.add_argument("--vcf", help="Input SNP/TR VCF file", 
type=str, required=True) parser.add_argument("--ref", help="Reference fasta file", type=str, required=True) parser.add_argument("--out", help="Prefix for output VCF file", type=str, required=True) + parser.add_argument("--max-alleles", help="Ignore loci with more than this many alleles", type=int, default=-1) + parser.add_argument("--min-alleles", help="Ignore loci with fewer than this many alleles", type=int, default=-1) parser.add_argument("--max-records", help="Quit after processing this many records (for debug)", \ default=-1, type=int) args = parser.parse_args() @@ -80,6 +82,16 @@ def main(args): # Check reference if not CheckReference(record, refgenome): continue # skip this record + # Check if too many alleles + if args.max_alleles != -1 and (1+len(record.ALT)) > args.max_alleles: + sys.stderr.write("Skipping {}:{} with {} ALT alleles\n".format(record.CHROM, record.POS, len(record.ALT))) + continue + # Check if too few alleles + # Include cases where there is an ALT listed but the AF is 0 + if (args.min_alleles != -1 and (1+len(record.ALT)) < args.min_alleles) or \ + (record.INFO["AF"]==0): + sys.stderr.write("Skipping {}:{} with {} ALT alleles, AF={}\n".format(record.CHROM, record.POS, len(record.ALT), str(record.INFO["AF"]))) + continue # Modify ID record.ID = GetTRRecordID(record, allids) allids.add(record.ID) From 017c688211a30d115200039d8d05fa4acc444995 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Sat, 31 Aug 2024 08:52:16 -0700 Subject: [PATCH 08/27] update readme with description of fixes --- scripts/fix-ref/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index a1d8713..c30e894 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -6,6 +6,8 @@ This step: * Removes records with incorrect reference allele * Adds required header line for TRTools * Adds informative variant IDs to STRs +* Remove loci with too many or too few alleles +* Remove loci with AF=0 ``` # 
Test on chr11 From 4b376113e9033d910c78e5c1cb563d9b516b954c Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 08:29:46 -0700 Subject: [PATCH 09/27] updating print statements in fix ref script --- .../fix_ensembletr_snpstr_reference.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 86212ac..43e7799 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -2,6 +2,8 @@ """ Script to clean up SNP/TR reference haplotype panels + +TODO: remove alleles with AC=0 """ import argparse @@ -18,15 +20,15 @@ def IsTRRecord(record_id): return record_id is None or record_id.strip() == "." def GetTRRecordID(record, allids): - locid = "EnsTR:%s:%s"%(record.CHROM, record.POS) + locid = "EnsTR:{chrom}:{pos}"%.format(chrom=record.CHROM, pos=record.POS) if locid not in allids: return locid for i in range(1, MAX_ALLOWED_DUPS): - newlocid = "%s-%s"%(locid, i) + newlocid = "{locid}:{i}" if newlocid not in allids: - sys.stderr.write("Adding duplicate locus %s\n"%newlocid) + sys.stderr.write("Adding duplicate locus {newlocid}\n") return newlocid - raise ValueError("Error: too many duplicates of %s"%locid) + raise ValueError("Error: too many duplicates of {locid}") def CheckReference(record, refgenome): # REF in VCF should match what is in the reference genome @@ -82,15 +84,20 @@ def main(args): # Check reference if not CheckReference(record, refgenome): continue # skip this record + # TODO - remove alleles with AF=0 # Check if too many alleles if args.max_alleles != -1 and (1+len(record.ALT)) > args.max_alleles: - sys.stderr.write("Skipping {}:{} with {} ALT alleles\n".format(record.CHROM, record.POS, len(record.ALT))) + sys.stderr.write("Skipping {chrom}:{pos} with {numalt} ALT alleles\n".format( + chrom=record.CHROM, pos=record.POS, numalt=len(record.ALT)) + 
) continue # Check if too few alleles # Include cases where there is an ALT listed but the AF is 0 if (args.min_alleles != -1 and (1+len(record.ALT)) < args.min_alleles) or \ (record.INFO["AF"]==0): - sys.stderr.write("Skipping {}:{} with {} ALT alleles, AF={}\n".format(record.CHROM, record.POS, len(record.ALT), str(record.INFO["AF"]))) + sys.stderr.write("Skipping {chrom}:{pos} with {numalt} ALT alleles, AF={alleles}\n".format( + chrom=record.CHROM, pos=record.POS, numalt=len(record.ALT), alleles=str(record.INFO["AF"])) + ) continue # Modify ID record.ID = GetTRRecordID(record, allids) @@ -101,11 +108,7 @@ def main(args): reader.close() writer.close() - - # Run bash script to convert to bref3 - cmd = "bash convert_to_bref3.sh {filename}".format(filename=args.out) - output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout.read() - print(output.decode("utf-8")) + sys.exit(0) if __name__ == "__main__": run() \ No newline at end of file From 9923347f36a7a24e9724b8a0a52c320c07d6e8e7 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 12:12:13 -0700 Subject: [PATCH 10/27] overhaul of fixref script to remove alleles with count=0 --- scripts/fix-ref/README.md | 10 +- .../fix_ensembletr_snpstr_reference.py | 139 ++++++++++++++---- 2 files changed, 117 insertions(+), 32 deletions(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index c30e894..2dcc403 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -16,11 +16,15 @@ chrom=11 --vcf chr${chrom}_final_SNP_merged_additional_TRs.vcf.gz \ --ref Homo_sapiens_assembly38.fasta \ --out ensembletr_refpanel_v3_chr${chrom} \ - --max-alleles 50 --min-alleles 2 + --max-alleles 100 --min-alleles 2 +bgzip ensembletr_refpanel_v3_chr${chrom}.vcf.gz +tabix -p vcf ensembletr_refpanel_v3_chr${chrom}.vcf.gz ``` ## Step 2: Convert to bref -``` -TODO +wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar +chrom=11 +zcat 
ensembletr_refpanel_v3_chr${chrom}.vcf.gz | \ + java -jar bref3.27May24.118.jar > ensembletr_refpanel_v3_chr${chrom}.bref ``` \ No newline at end of file diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 43e7799..6a3adc1 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -2,8 +2,6 @@ """ Script to clean up SNP/TR reference haplotype panels - -TODO: remove alleles with AC=0 """ import argparse @@ -12,26 +10,97 @@ import sys import subprocess import os - +import tempfile MAX_ALLOWED_DUPS = 1000 +def GetWriter(fname, reader): + """ + Get regular file writer, not cyvcf2.Writer + Since we are going to update INFO/FORMAT downstream + """ + outf = open(fname, "w") + # Get header from reader, but remove DS/GP + header = reader.raw_header + for line in header.split("\n"): + if line.startswith("##FORMAT= 0: allele_counts[a] = acount + # Keep order same as original + allele_order = [item for item in alleles if item in allele_counts.keys()] + return allele_counts, allele_order + +def WriteRecord(writer, record): + """simple function to write the record as is """ + writer.write(str(record).strip()+"\n") + +def UpdateINFO(INFO, allele_counts, allele_order): + """ + Keep INFO as is, except update AC/AF + """ + info_items = [] + for infokey in dict(INFO).keys(): + if infokey in ["AC", "AF"]: + continue + else: + # Note all other INFO fields are single values + infoval = INFO.get(infokey) + info_items.append(f"{infokey}={infoval}") + # Add AC/AF + num_chroms = sum(allele_counts.values()) + ac_vals = [allele_counts[a] for a in allele_order[1:]] + af_vals = [allele_counts[a]/num_chroms for a in allele_order[1:]] + info_items.append("AC=" + ",".join([str(item) for item in ac_vals])) + info_items.append("AF=" + ",".join(["%.4f"%item for item in af_vals])) + return ";".join(info_items) + +def GetGT(sample, orig_alleles, allele_order): + 
"""Update the GT based on new allele order""" + if sample[2] == True: + sep = "|" + else: sep = "/" + a1 = allele_order.index(orig_alleles[sample[0]]) + a2 = allele_order.index(orig_alleles[sample[1]]) + return f"{a1}{sep}{a2}" + def IsTRRecord(record_id): + """Check if the record is a TR""" return record_id is None or record_id.strip() == "." def GetTRRecordID(record, allids): - locid = "EnsTR:{chrom}:{pos}"%.format(chrom=record.CHROM, pos=record.POS) + """Get new uniquified record ID""" + locid = "EnsTR:{chrom}:{pos}".format(chrom=record.CHROM, pos=record.POS) if locid not in allids: return locid for i in range(1, MAX_ALLOWED_DUPS): - newlocid = "{locid}:{i}" + newlocid = f"{locid}:{i}" if newlocid not in allids: - sys.stderr.write("Adding duplicate locus {newlocid}\n") + sys.stderr.write(f"Adding duplicate locus {newlocid}\n") return newlocid - raise ValueError("Error: too many duplicates of {locid}") + raise ValueError(f"Error: too many duplicates of {locid}") def CheckReference(record, refgenome): - # REF in VCF should match what is in the reference genome + """ REF in VCF should match what is in the reference genome""" refseq = refgenome[record.CHROM][record.POS-1:record.POS-1+len(record.REF)] return (refseq == record.REF) @@ -65,12 +134,14 @@ def main(args): # Set up writer, adding the missing header reader.add_to_header('##command=hipstr;note this is a dummy header line') reader.add_to_header('##INFO=') - writer = cyvcf2.Writer(args.out + ".vcf", reader) + writer = GetWriter(args.out + ".vcf", reader) # Go through each record # If SNP: just print it - # if STR: filter records with incorrect reference, - # and modify ID + # if STR: (1) Filter records with incorrect reference + # (2) Filter too many/too few alleles + # (3) Modify record ID + # (4) Write record, without DP/GS, with updated AF/AC and without AC=0 allids = set() num_records_processed = 0 for record in reader: @@ -79,32 +150,42 @@ def main(args): break # for debug if not IsTRRecord(record.ID): 
record.INFO["VT"] = "OTHER" - writer.write_record(record) + WriteRecord(writer, record) else: - # Check reference + # (1) Filter records with incorrect reference if not CheckReference(record, refgenome): continue # skip this record - # TODO - remove alleles with AF=0 - # Check if too many alleles - if args.max_alleles != -1 and (1+len(record.ALT)) > args.max_alleles: - sys.stderr.write("Skipping {chrom}:{pos} with {numalt} ALT alleles\n".format( - chrom=record.CHROM, pos=record.POS, numalt=len(record.ALT)) + + # (2) Filter too many/too few alleles + allele_order = [record.REF] + record.ALT + # Note: GetAlleleCounts() doesn't include things with AC=0 + allele_counts, allele_order = GetAlleleCounts(record) + num_alleles = len(allele_counts.keys()) + if args.max_alleles != -1 and num_alleles > args.max_alleles: + sys.stderr.write("Skipping {chrom}:{pos} with {num} alleles\n".format( + chrom=record.CHROM, pos=record.POS, num=num_alleles) ) - continue - # Check if too few alleles - # Include cases where there is an ALT listed but the AF is 0 - if (args.min_alleles != -1 and (1+len(record.ALT)) < args.min_alleles) or \ - (record.INFO["AF"]==0): - sys.stderr.write("Skipping {chrom}:{pos} with {numalt} ALT alleles, AF={alleles}\n".format( - chrom=record.CHROM, pos=record.POS, numalt=len(record.ALT), alleles=str(record.INFO["AF"])) + continue # skip this record + if args.min_alleles != -1 and num_alleles < args.min_alleles: + sys.stderr.write("Skipping {chrom}:{pos} with {num} alleles\n".format( + chrom=record.CHROM, pos=record.POS, num=num_alleles) ) - continue - # Modify ID + continue # skip this record + + # (3) Modify record ID record.ID = GetTRRecordID(record, allids) allids.add(record.ID) record.INFO["VT"] = "TR" - # Write to file - writer.write_record(record) + + # (4) Write record to file, update AC/AF, only include GT, exclude AC=0 + orig_alleles = [record.REF] + record.ALT + updated_info = UpdateINFO(record.INFO, allele_counts, allele_order) + out_items = 
[record.CHROM, record.POS, record.ID, \ + allele_order[0], ",".join(allele_order[1:]), \ + ".", "PASS", updated_info, "GT"] + for sample in record.genotypes: + out_items.append(GetGT(sample, orig_alleles, allele_order)) + writer.write("\t".join([str(item) for item in out_items])+"\n") reader.close() writer.close() From dbba8ab8b7b2560707d0276326d06306a93150c7 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 12:12:48 -0700 Subject: [PATCH 11/27] remove convert to bref script --- scripts/fix-ref/convert_to_bref3.sh | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 scripts/fix-ref/convert_to_bref3.sh diff --git a/scripts/fix-ref/convert_to_bref3.sh b/scripts/fix-ref/convert_to_bref3.sh deleted file mode 100644 index 63e0d96..0000000 --- a/scripts/fix-ref/convert_to_bref3.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -ref=$1 -bref="bref3.27May24.118.jar" - - - -#bgzip and index ref -echo "bgzip and index ref" -bgzip ${ref}.vcf -tabix -p vcf ${ref}.vcf.gz - -#dowaload bref3.jar -# Check if the file exists locally -if [ ! -f "$bref" ]; then - echo "File does not exist. Downloading..." - wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar -else - echo "File already exists. Continuing..." 
-fi - -#convert vcf to bref3 -echo "converting vcf to bref3 format" -zcat < ${ref}.vcf.gz | java -jar $bref > ${ref}.bref3 -echo "Done converting" - - \ No newline at end of file From ee145ec10afa844c6118418e97a3fe7e64d53309 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 12:18:54 -0700 Subject: [PATCH 12/27] update to write to stdout so we can pipe to bgzip --- scripts/fix-ref/README.md | 4 +--- scripts/fix-ref/fix_ensembletr_snpstr_reference.py | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 2dcc403..5e96e64 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -15,9 +15,7 @@ chrom=11 ./fix_ensembletr_snpstr_reference.py \ --vcf chr${chrom}_final_SNP_merged_additional_TRs.vcf.gz \ --ref Homo_sapiens_assembly38.fasta \ - --out ensembletr_refpanel_v3_chr${chrom} \ - --max-alleles 100 --min-alleles 2 -bgzip ensembletr_refpanel_v3_chr${chrom}.vcf.gz + --max-alleles 100 --min-alleles 2 | bgzip -c > ensembletr_refpanel_v3_chr${chrom}.vcf.gz tabix -p vcf ensembletr_refpanel_v3_chr${chrom}.vcf.gz ``` diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 6a3adc1..92cd834 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -19,7 +19,9 @@ def GetWriter(fname, reader): Get regular file writer, not cyvcf2.Writer Since we are going to update INFO/FORMAT downstream """ - outf = open(fname, "w") + if fname == "stdout.vcf": + outf = sys.stdout + else: outf = open(fname, "w") # Get header from reader, but remove DS/GP header = reader.raw_header for line in header.split("\n"): @@ -116,7 +118,7 @@ def getargs(): # pragma: no cover parser = argparse.ArgumentParser(__doc__) parser.add_argument("--vcf", help="Input SNP/TR VCF file", type=str, required=True) parser.add_argument("--ref", help="Reference fasta file", type=str, 
required=True) - parser.add_argument("--out", help="Prefix for output VCF file", type=str, required=True) + parser.add_argument("--out", help="Prefix for output VCF file. or 'stdout' for standard output", type=str, default="stdout") parser.add_argument("--max-alleles", help="Ignore loci with more than this many alleles", type=int, default=-1) parser.add_argument("--min-alleles", help="Ignore loci with fewer than this many alleles", type=int, default=-1) parser.add_argument("--max-records", help="Quit after processing this many records (for debug)", \ From 403c1d7e07bc34d05e923e4d98e3516f48e7f959 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 12:23:45 -0700 Subject: [PATCH 13/27] fix readme format issue --- scripts/fix-ref/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 5e96e64..6248aa2 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -21,6 +21,7 @@ tabix -p vcf ensembletr_refpanel_v3_chr${chrom}.vcf.gz ## Step 2: Convert to bref +``` wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar chrom=11 zcat ensembletr_refpanel_v3_chr${chrom}.vcf.gz | \ From e76b3cd238cc9ef5e5630184946665770627bcfc Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Fri, 6 Sep 2024 12:24:30 -0700 Subject: [PATCH 14/27] fix readme format issue --- scripts/fix-ref/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 6248aa2..c17e03c 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -6,8 +6,10 @@ This step: * Removes records with incorrect reference allele * Adds required header line for TRTools * Adds informative variant IDs to STRs +* Adds VT field * Remove loci with too many or too few alleles -* Remove loci with AF=0 +* Remove alleles with AF=0 +* Strip DS/GP fields ``` # Test on chr11 From f1bcc576e99b8ff9d638e5059874ccaa40ce2b4c Mon Sep 17 
00:00:00 2001 From: Yang_Li Date: Mon, 9 Sep 2024 13:25:20 -0700 Subject: [PATCH 15/27] update the main README.md and make some changes on the fix_ensembletr_snpstr_reference.py --- README.md | 10 +++++++ .../fix_ensembletr_snpstr_reference.py | 27 +++++++++++-------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 9014d9a..8b0c686 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,16 @@ https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered For version I of phased panels, please use https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz.csi for tbi file. +## Version III + +All files description and download links can be downloaded [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). The version III of EnsembleTR calls set fix the following issues: +1. Remove TRs are not match the reference record. +2. Remove TRs have more than 100 alleles. +3. Remove TRs have less than 2 alleles. +4. For each loci, remove alelles with 0 count. DS/GP field also been removed. +5. For TRs with the same POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. Add VT field. +6. Add the .bref format files. + ## Notes on HipSTR input HipSTR might expand the coordinates of the repeat if there is a nearby SNP. If you have multiple HipSTR outputs from different individuals and want to use mergeSTR to merge them, please use our python script, *Hipstr_correction.py*, to correct the merged HipSTR VCF file ensuring that multiple records from the same repeat culminate in a single unified record. 
diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 92cd834..aea784a 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -146,6 +146,12 @@ def main(args): # (4) Write record, without DP/GS, with updated AF/AC and without AC=0 allids = set() num_records_processed = 0 + num_snps_keeped = 0 + num_strs_keeped = 0 + num_str_failed_ref = 0 + num_str_failed_max_allele = 0 + num_str_failed_min_allele = 0 + for record in reader: num_records_processed += 1 if args.max_records > 0 and num_records_processed > args.max_records: @@ -153,32 +159,31 @@ def main(args): if not IsTRRecord(record.ID): record.INFO["VT"] = "OTHER" WriteRecord(writer, record) + num_snps_keeped += 1 else: # (1) Filter records with incorrect reference if not CheckReference(record, refgenome): + num_str_failed_ref += 1 continue # skip this record # (2) Filter too many/too few alleles - allele_order = [record.REF] + record.ALT # Note: GetAlleleCounts() doesn't include things with AC=0 allele_counts, allele_order = GetAlleleCounts(record) num_alleles = len(allele_counts.keys()) if args.max_alleles != -1 and num_alleles > args.max_alleles: - sys.stderr.write("Skipping {chrom}:{pos} with {num} alleles\n".format( - chrom=record.CHROM, pos=record.POS, num=num_alleles) - ) + num_str_failed_max_allele += 1 + sys.stderr.write(f"Skipping {record.CHROM}:{record.POS} with {num_alleles} alleles\n") continue # skip this record if args.min_alleles != -1 and num_alleles < args.min_alleles: - sys.stderr.write("Skipping {chrom}:{pos} with {num} alleles\n".format( - chrom=record.CHROM, pos=record.POS, num=num_alleles) - ) + num_str_failed_min_allele += 1 + sys.stderr.write(f"Skipping {record.CHROM}:{record.POS} with {num_alleles} alleles\n") continue # skip this record # (3) Modify record ID record.ID = GetTRRecordID(record, allids) allids.add(record.ID) - record.INFO["VT"] = "TR" - + 
record.INFO["VT"] = "TR" + num_strs_keeped += 1 # (4) Write record to file, update AC/AF, only include GT, exclude AC=0 orig_alleles = [record.REF] + record.ALT updated_info = UpdateINFO(record.INFO, allele_counts, allele_order) @@ -188,10 +193,10 @@ def main(args): for sample in record.genotypes: out_items.append(GetGT(sample, orig_alleles, allele_order)) writer.write("\t".join([str(item) for item in out_items])+"\n") - + sys.stdout.write(f"All {num_records_processed:,} records processed: keep {num_snps_keeped:,} snps, {num_strs_keeped:,} STRs; remove {num_str_failed_ref:,} STRs mismatch with reference, {num_str_failed_max_allele:,} STRs with more than {args.max_alleles:,} alleles, {num_str_failed_min_allele:,} STRs less than {args.min_alleles} alleles.\n") reader.close() writer.close() sys.exit(0) if __name__ == "__main__": - run() \ No newline at end of file + run() From 70d2b665bc777dd2f38086a2f8a2ccb27b34974e Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:02:34 -0700 Subject: [PATCH 16/27] adding links to v3 panel --- README.md | 106 +++++++++++++-------------------- archive_ensembletr_datasets.md | 89 +++++++++++++++++++++++++++ 2 files changed, 130 insertions(+), 65 deletions(-) create mode 100644 archive_ensembletr_datasets.md diff --git a/README.md b/README.md index 8b0c686..587da08 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,10 @@ statSTR --vcf EnsembleTR_file.vcf.gz --out EnsembleTR_per_locus_allele_frequency ``` +# EnsembleTR data releases + +Archived datasets, including the Version II calls and Version II haplotype panel files can be found [here](archive_ensembletr_datasets.md). 
+ ## Version II of EnsembleTR calls on samples from 1000 Genomes Project and H3Africa Chromosome 1 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr1_filtered.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr1_filtered.vcf.gz.tbi) @@ -125,101 +129,73 @@ Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntr Chromosome 22 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr22_filtered.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr22_filtered.vcf.gz.tbi) -## Version II of reference SNP+TR haplotype panel for imputation of TR variants - -### Dataset description - -[Phased variants](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/) of 3,202 samples from the 1000 Genomes Project (1kGP). +## Version III of reference SNP+TR haplotype panel for imputation of TR variants -TRs imputed from 3,202 1kGP samples. +These files contain: +* [Phased SNP and indel variants](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/) of 3,202 samples from the 1000 Genomes Project (1kGP). +* TRs phased/imputed from 3,202 1kGP samples based on EnsembleTR calls. -Total 70,692,015 variants + 1,091,550 TR markers. +There are in total 1,070,698 TRs and 70,692,015 SNPs/indels. All the coordinates are based on **hg38** human reference genome. 
-### Availability - -Chromosome 1 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 2 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 3 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 4 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz.tbi) +These files contain the same data as [Version II](archive_ensembletr_datasets.md), with the following updates to facilitate use in downstream imputation pipelines: -Chromosome 5 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz.tbi) +1. Remove TRs for which the REF allele does not match the expected sequence based on CHR:POS +2. For each TR, remove alleles with 0 count. +3. Remove TRs which have more than 100 alleles. +4. Remove TRs which have less than 2 alleles. +5. Remove the DS/GP fields which are large and not used by downstream steps. +6. Add unique IDs for each TR of the format EnsTR:CHROM:POS.
For TRs with the same CHR:POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. +7. Add VT field, set to VT=TR for TRs and VT=OTHER for other variant types +8. Add the .bref format files which have the same information as the VCFs but can improve Beagle imputation performance. -Chromosome 6 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz.tbi) +All file descriptions and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). Data and links for each chromosome for the Version III panel are also provided below. -Chromosome 7 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 1 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref) SNPs/indels=5,759,060 TRs=92,372 -Chromosome 8 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 2 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)
[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref) SNPs/indels=6,088,598 TRs=91,133 -Chromosome 9 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 3 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref) SNPs/indels=4,983,185 TRs=75,233 -Chromosome 10 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 4 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref) SNPs/indels=4,875,465 TRs=69,325 -Chromosome 11 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 5 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz) 
[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref) SNPs/indels=4,536,819 TRs=66,493 -Chromosome 12 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 6 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref) SNPs/indels=4,315,217 TRs=65,938 -Chromosome 13 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 7 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref) SNPs/indels=4,137,254 TRs=59,410 -Chromosome 14 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 8 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz) 
[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref) SNPs/indels=3,886,222 TRs=55,141 -Chromosome 15 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 9 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref) SNPs/indels=3,165,513 TRs=44,188 -Chromosome 16 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 10 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref) SNPs/indels=3,495,473 TRs=51,637 -Chromosome 17 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 11 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz) 
[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref) SNPs/indels=3,423,341 TRs=49,599 -Chromosome 18 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 12 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref) SNPs/indels=3,332,788 TRs=55,886 -Chromosome 19 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 20 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 22 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz) and [tbi 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -### Usage - -Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impute TRs into SNP data: - -``` -java -Xmx4g -jar beagle.version.jar \ - gt=SNPs.vcf.gz \ - ref=${chrom}_final_SNP_merged.vcf.gz \ - out=imputed_TR_SNPs -``` +Chromosome 13 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref) SNPs/indels=2,509,179 TRs=35,720 -Please use the [version 5.4](https://github.com/gymreklab/1000Genomes-TR-Analysis/raw/main/phasing/validation/beagle.19Apr22.7c0.jar) for this analysis as we had issues with the newer versions of Beagle and we are right now communicating it with Beagle developers. +Chromosome 14 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref) SNPs/indels=2,290,400 TRs=36,199 -## Additional resources +Chromosome 15 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref) SNPs/indels=2,109,285 TRs=32,336 -Per locus summary statistics can be downloaded from [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/repeat_tables.zip). 
Each table has information on coordinates, repeat unit sequence, and potential overlap with genes listed in GENCODE v22 for repeats in EnsembleTR catalog. +Chromosome 16 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref) SNPs/indels=2,362,361 TRs=35,451 -Population-specific per locus statistics on allele frequency, heterozygosity, and the number of called samples can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/afreq_tables.zip). Statistics are computed using statSTR from the TRTools package. +Chromosome 17 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref) SNPs/indels=2,073,624 TRs=38,377 +Chromosome 18 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref) SNPs/indels=1,963,845 TRs=28,444 -## Version I +Chromosome 19 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref) SNPs/indels=1,670,692 TRs=33,536 
-For version I of EnsembleTR calls, please use -https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz.tbi for tbi file. +Chromosome 20 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref) SNPs/indels=1,644,384 TRs=25,743 -For version I of phased panels, please use -https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz.csi for tbi file. +Chromosome 21 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref) SNPs/indels=1,002,753 TRs=12,894 -## Version III +Chromosome 22 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref) SNPs/indels=1,066,557 TRs=15,643 -All files description and download links can be downloaded [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). The version III of EnsembleTR calls set fix the following issues: -1. 
Remove TRs are not match the reference record. -2. Remove TRs have more than 100 alleles. -3. Remove TRs have less than 2 alleles. -4. For each loci, remove alelles with 0 count. DS/GP field also been removed. -5. For TRs with the same POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. Add VT field. -6. Add the .bref format files. ## Notes on HipSTR input diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md new file mode 100644 index 0000000..a415a18 --- /dev/null +++ b/archive_ensembletr_datasets.md @@ -0,0 +1,89 @@ +# EnsembleTR archived datasets + +This page contains links to older versions of ensembleTR files + +## Version II of reference SNP+TR haplotype panel for imputation of TR variants + +### Dataset description + +[Phased variants](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/) of 3,202 samples from the 1000 Genomes Project (1kGP). + +TRs imputed from 3,202 1kGP samples. + +Total 70,692,015 variants + 1,091,550 TR markers. + +All the coordinates are based on **hg38** human reference genome. 
+ +### Availability + +Chromosome 1 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 2 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 3 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 4 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 5 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 6 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 7 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 8 [VCF 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 9 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 10 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 11 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 12 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 13 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 14 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 15 [VCF 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 16 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 17 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 18 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 19 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 20 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 22 [VCF 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +### Usage + +Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impute TRs into SNP data: + +``` +java -Xmx4g -jar beagle.version.jar \ + gt=SNPs.vcf.gz \ + ref=${chrom}_final_SNP_merged.vcf.gz \ + out=imputed_TR_SNPs +``` + +Please use the [version 5.4](https://github.com/gymreklab/1000Genomes-TR-Analysis/raw/main/phasing/validation/beagle.19Apr22.7c0.jar) for this analysis as we had issues with the newer versions of Beagle and we are right now communicating it with Beagle developers. + +## Additional resources + +Per locus summary statistics can be downloaded from [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/repeat_tables.zip). Each table has information on coordinates, repeat unit sequence, and potential overlap with genes listed in GENCODE v22 for repeats in EnsembleTR catalog. + +Population-specific per locus statistics on allele frequency, heterozygosity, and the number of called samples can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/afreq_tables.zip). Statistics are computed using statSTR from the TRTools package. + + +## Version I + +For version I of EnsembleTR calls, please use +https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz.tbi for tbi file. + +For version I of phased panels, please use +https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz.csi for the csi index file. 
\ No newline at end of file From 510820f5c4fe6b9a6ffab80306bf2a6a990a914b Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:03:40 -0700 Subject: [PATCH 17/27] adding links to v3 panel --- README.md | 6 ++++++ archive_ensembletr_datasets.md | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 587da08..3db6cf8 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,12 @@ Chromosome 21 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-re Chromosome 22 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref) SNPs/indels=1,066,557 TRs=15,643 +## Additional resources + +Per locus summary statistics can be downloaded from [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/repeat_tables.zip). Each table has information on coordinates, repeat unit sequence, and potential overlap with genes listed in GENCODE v22 for repeats in EnsembleTR catalog. + +Population-specific per locus statistics on allele frequency, heterozygosity, and the number of called samples can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/afreq_tables.zip). Statistics are computed using statSTR from the TRTools package. 
+ ## Notes on HipSTR input diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md index a415a18..b511645 100644 --- a/archive_ensembletr_datasets.md +++ b/archive_ensembletr_datasets.md @@ -73,13 +73,6 @@ java -Xmx4g -jar beagle.version.jar \ Please use the [version 5.4](https://github.com/gymreklab/1000Genomes-TR-Analysis/raw/main/phasing/validation/beagle.19Apr22.7c0.jar) for this analysis as we had issues with the newer versions of Beagle and we are right now communicating it with Beagle developers. -## Additional resources - -Per locus summary statistics can be downloaded from [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/repeat_tables.zip). Each table has information on coordinates, repeat unit sequence, and potential overlap with genes listed in GENCODE v22 for repeats in EnsembleTR catalog. - -Population-specific per locus statistics on allele frequency, heterozygosity, and the number of called samples can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/afreq_tables.zip). Statistics are computed using statSTR from the TRTools package. 
- - ## Version I For version I of EnsembleTR calls, please use From 3503a58f6e1be17b0c2688fd4adf6ed36eff8e56 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:09:56 -0700 Subject: [PATCH 18/27] adding links to v3 panel --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 3db6cf8..bd8bfb5 100644 --- a/README.md +++ b/README.md @@ -196,6 +196,19 @@ Chromosome 21 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-re Chromosome 22 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref) SNPs/indels=1,066,557 TRs=15,643 +### Usage + +Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impute TRs into SNP data: + +``` +java -Xmx4g -jar beagle.version.jar \ + gt=SNPs_chr${chrom}.vcf.gz \ + ref=ensembletr_refpanel_v3_chr${chrom}.bref \ + out=imputed_TR_SNPs_chr${chrom} +``` + +We have tested this with Beagle jar file [beagle.27May24.118.jar](https://faculty.washington.edu/browning/beagle/beagle.27May24.118.jar). Earlier releases of Beagle 5.4 had problems imputing from this panel due to a file decompression issue. + ## Additional resources Per locus summary statistics can be downloaded from [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/tables/repeat_tables.zip). Each table has information on coordinates, repeat unit sequence, and potential overlap with genes listed in GENCODE v22 for repeats in EnsembleTR catalog. 
From b8fbefd28e933bcbb31db567f642858cdd73178f Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:10:21 -0700 Subject: [PATCH 19/27] adding links to v3 panel --- archive_ensembletr_datasets.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md index b511645..5716122 100644 --- a/archive_ensembletr_datasets.md +++ b/archive_ensembletr_datasets.md @@ -60,18 +60,6 @@ Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/addition Chromosome 22 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz.tbi) -### Usage - -Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impute TRs into SNP data: - -``` -java -Xmx4g -jar beagle.version.jar \ - gt=SNPs.vcf.gz \ - ref=${chrom}_final_SNP_merged.vcf.gz \ - out=imputed_TR_SNPs -``` - -Please use the [version 5.4](https://github.com/gymreklab/1000Genomes-TR-Analysis/raw/main/phasing/validation/beagle.19Apr22.7c0.jar) for this analysis as we had issues with the newer versions of Beagle and we are right now communicating it with Beagle developers. ## Version I From 19cec77639233e7e949b74781c70be7c77aed709 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:11:58 -0700 Subject: [PATCH 20/27] adding links to v3 panel --- archive_ensembletr_datasets.md | 45 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md index 5716122..29e2d5f 100644 --- a/archive_ensembletr_datasets.md +++ b/archive_ensembletr_datasets.md @@ -16,50 +16,49 @@ All the coordinates are based on **hg38** human reference genome. 
### Availability -Chromosome 1 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref)] SNPs/indels=5,759,060 TRs=92,372 -Chromosome 2 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref)] SNPs/indels=6,088,598 TRs=91,133 -Chromosome 3 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref)] SNPs/indels=4,983,185 
TRs=75,233 -Chromosome 4 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref)] SNPs/indels=4,875,465 TRs=69,325 -Chromosome 5 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref)] SNPs/indels=4,536,819 TRs=66,493 -Chromosome 6 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref)] SNPs/indels=4,315,217 
TRs=65,938 -Chromosome 7 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref)] SNPs/indels=4,137,254 TRs=59,410 -Chromosome 8 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref)] SNPs/indels=3,886,222 TRs=55,141 -Chromosome 9 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref)] SNPs/indels=3,165,513 
TRs=44,188 -Chromosome 10 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref)] SNPs/indels=3,495,473 TRs=51,637 -Chromosome 11 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref)] SNPs/indels=3,423,341 TRs=49,599 -Chromosome 12 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref)] 
SNPs/indels=3,332,788 TRs=55,886 -Chromosome 13 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref)] SNPs/indels=2,509,179 TRs=35,720 -Chromosome 14 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref)] SNPs/indels=2,290,400 TRs=36,199 -Chromosome 15 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] 
[[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref)] SNPs/indels=2,109,285 TRs=32,336 -Chromosome 16 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref)] SNPs/indels=2,362,361 TRs=35,451 -Chromosome 17 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref)] SNPs/indels=2,073,624 TRs=38,377 -Chromosome 18 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref)] SNPs/indels=1,963,845 TRs=28,444 -Chromosome 19 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref)] SNPs/indels=1,670,692 TRs=33,536 -Chromosome 20 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref)] SNPs/indels=1,644,384 TRs=25,743 -Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz.tbi) - -Chromosome 22 [VCF 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz.tbi) +Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref)] SNPs/indels=1,002,753 TRs=12,894 +Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref)] SNPs/indels=1,066,557 TRs=15,643 ## Version I From 507e4c99ed4ea19badf0c244f358696410b8b202 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 9 Sep 2024 20:13:10 -0700 Subject: [PATCH 21/27] adding links to v3 panel --- README.md | 44 ++++++++++++++++----------------- archive_ensembletr_datasets.md | 45 +++++++++++++++++----------------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index bd8bfb5..58541a0 100644 --- a/README.md +++ b/README.md @@ -152,49 +152,49 @@ These files contain the same data as [Version II](archive_ensembletr_datasets.md All file description and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). Data and links for each chromosome for the Verson III panel are also provided below. 
-Chromosome 1 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref) SNPs/indels=5,759,060 TRs=92,372 +Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref)] SNPs/indels=5,759,060 TRs=92,372 -Chromosome 2 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref) SNPs/indels=6,088,598 TRs=91,133 +Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref)] SNPs/indels=6,088,598 TRs=91,133 -Chromosome 3 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref) SNPs/indels=4,983,185 TRs=75,233 +Chromosome 3 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref)] SNPs/indels=4,983,185 TRs=75,233 -Chromosome 4 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref) SNPs/indels=4,875,465 TRs=69,325 +Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref)] SNPs/indels=4,875,465 TRs=69,325 -Chromosome 5 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref) SNPs/indels=4,536,819 TRs=66,493 +Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref)] SNPs/indels=4,536,819 TRs=66,493 -Chromosome 6 
[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref) SNPs/indels=4,315,217 TRs=65,938 +Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref)] SNPs/indels=4,315,217 TRs=65,938 -Chromosome 7 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref) SNPs/indels=4,137,254 TRs=59,410 +Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref)] SNPs/indels=4,137,254 TRs=59,410 -Chromosome 8 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref) SNPs/indels=3,886,222 TRs=55,141 +Chromosome 8 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref)] SNPs/indels=3,886,222 TRs=55,141 -Chromosome 9 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref) SNPs/indels=3,165,513 TRs=44,188 +Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref)] SNPs/indels=3,165,513 TRs=44,188 -Chromosome 10 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref) SNPs/indels=3,495,473 TRs=51,637 +Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref)] SNPs/indels=3,495,473 TRs=51,637 -Chromosome 11 
[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref) SNPs/indels=3,423,341 TRs=49,599 +Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref)] SNPs/indels=3,423,341 TRs=49,599 -Chromosome 12 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref) SNPs/indels=3,332,788 TRs=55,886 +Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref)] SNPs/indels=3,332,788 TRs=55,886 -Chromosome 13 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref) SNPs/indels=2,509,179 TRs=35,720 +Chromosome 13 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref)] SNPs/indels=2,509,179 TRs=35,720 -Chromosome 14 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref) SNPs/indels=2,290,400 TRs=36,199 +Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref)] SNPs/indels=2,290,400 TRs=36,199 -Chromosome 15 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref) SNPs/indels=2,109,285 TRs=32,336 +Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref)] SNPs/indels=2,109,285 TRs=32,336 -Chromosome 16 
[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref) SNPs/indels=2,362,361 TRs=35,451 +Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref)] SNPs/indels=2,362,361 TRs=35,451 -Chromosome 17 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref) SNPs/indels=2,073,624 TRs=38,377 +Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref)] SNPs/indels=2,073,624 TRs=38,377 -Chromosome 18 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref) SNPs/indels=1,963,845 TRs=28,444 +Chromosome 18 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref)] SNPs/indels=1,963,845 TRs=28,444 -Chromosome 19 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref) SNPs/indels=1,670,692 TRs=33,536 +Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref)] SNPs/indels=1,670,692 TRs=33,536 -Chromosome 20 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref) SNPs/indels=1,644,384 TRs=25,743 +Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref)] SNPs/indels=1,644,384 TRs=25,743 -Chromosome 21 
[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref) SNPs/indels=1,002,753 TRs=12,894 +Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref)] SNPs/indels=1,002,753 TRs=12,894 -Chromosome 22 [VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz) [tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi) [bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref) SNPs/indels=1,066,557 TRs=15,643 +Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref)] SNPs/indels=1,066,557 TRs=15,643 ### Usage diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md index 29e2d5f..5716122 100644 --- a/archive_ensembletr_datasets.md +++ b/archive_ensembletr_datasets.md @@ -16,49 +16,50 @@ All the coordinates are based on **hg38** human reference genome. 
### Availability -Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref)] SNPs/indels=5,759,060 TRs=92,372 +Chromosome 1 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr1_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref)] SNPs/indels=6,088,598 TRs=91,133 +Chromosome 2 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr2_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref)] SNPs/indels=4,983,185 TRs=75,233 +Chromosome 3 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz) and [tbi 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr3_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref)] SNPs/indels=4,875,465 TRs=69,325 +Chromosome 4 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr4_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref)] SNPs/indels=4,536,819 TRs=66,493 +Chromosome 5 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr5_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref)] SNPs/indels=4,315,217 TRs=65,938 +Chromosome 6 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz) and [tbi 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr6_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref)] SNPs/indels=4,137,254 TRs=59,410 +Chromosome 7 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr7_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref)] SNPs/indels=3,886,222 TRs=55,141 +Chromosome 8 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr8_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref)] SNPs/indels=3,165,513 TRs=44,188 +Chromosome 9 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz) and [tbi 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr9_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref)] SNPs/indels=3,495,473 TRs=51,637 +Chromosome 10 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr10_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref)] SNPs/indels=3,423,341 TRs=49,599 +Chromosome 11 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr11_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref)] SNPs/indels=3,332,788 TRs=55,886 +Chromosome 12 [VCF 
file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr12_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref)] SNPs/indels=2,509,179 TRs=35,720 +Chromosome 13 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr13_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref)] SNPs/indels=2,290,400 TRs=36,199 +Chromosome 14 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr14_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref)] SNPs/indels=2,109,285 TRs=32,336 
+Chromosome 15 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr15_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref)] SNPs/indels=2,362,361 TRs=35,451 +Chromosome 16 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr16_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref)] SNPs/indels=2,073,624 TRs=38,377 +Chromosome 17 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr17_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref)] 
SNPs/indels=1,963,845 TRs=28,444 +Chromosome 18 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr18_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref)] SNPs/indels=1,670,692 TRs=33,536 +Chromosome 19 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr19_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref)] SNPs/indels=1,644,384 TRs=25,743 +Chromosome 20 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr20_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] 
[[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref)] SNPs/indels=1,002,753 TRs=12,894 +Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr21_final_SNP_merged_additional_TRs.vcf.gz.tbi) + +Chromosome 22 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/additional-phased-trs/chr22_final_SNP_merged_additional_TRs.vcf.gz.tbi) -Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref)] SNPs/indels=1,066,557 TRs=15,643 ## Version I From 24e1e61465a2dd91ff93fde35376f02b8f29a2b8 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Mon, 23 Sep 2024 13:30:12 -0700 Subject: [PATCH 22/27] use bref3 --- scripts/fix-ref/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index c17e03c..6b50a4c 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -27,5 +27,5 @@ tabix -p vcf ensembletr_refpanel_v3_chr${chrom}.vcf.gz wget https://faculty.washington.edu/browning/beagle/bref3.27May24.118.jar chrom=11 zcat ensembletr_refpanel_v3_chr${chrom}.vcf.gz | \ - java -jar bref3.27May24.118.jar > ensembletr_refpanel_v3_chr${chrom}.bref + java -jar bref3.27May24.118.jar > ensembletr_refpanel_v3_chr${chrom}.bref3 ``` \ No newline at end of file From 4a469207173012bccc24fcf3a72766feebf70544 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek 
Date: Tue, 24 Sep 2024 09:28:40 -0700 Subject: [PATCH 23/27] update fixref script to remove duplicates --- scripts/fix-ref/fix_ensembletr_snpstr_reference.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index aea784a..33c2ccd 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -141,10 +141,12 @@ def main(args): # Go through each record # If SNP: just print it # if STR: (1) Filter records with incorrect reference - # (2) Filter too many/too few alleles + # (2) Filter too many/too few alleles, plus remove if duplicate after that + # (based on chr:pos:alleles) # (3) Modify record ID # (4) Write record, without DP/GS, with updated AF/AC and without AC=0 allids = set() + allloci = set() # chrom:pos:ref:alt to keep track of duplicates num_records_processed = 0 num_snps_keeped = 0 num_strs_keeped = 0 @@ -178,12 +180,20 @@ def main(args): num_str_failed_min_allele += 1 sys.stderr.write(f"Skipping {record.CHROM}:{record.POS} with {num_alleles} alleles\n") continue # skip this record + # Check if duplicate + locinfo = f"{record.CHROM}:{record.POS}:"+",".join(allele_order) + if locinfo in allloci: + sys.stderr.write(f"Skipping duplicate {locinfo}\n") + continue + else: + allloci.add(locinfo) # (3) Modify record ID record.ID = GetTRRecordID(record, allids) allids.add(record.ID) record.INFO["VT"] = "TR" num_strs_keeped += 1 + # (4) Write record to file, update AC/AF, only include GT, exclude AC=0 orig_alleles = [record.REF] + record.ALT updated_info = UpdateINFO(record.INFO, allele_counts, allele_order) @@ -193,7 +203,7 @@ def main(args): for sample in record.genotypes: out_items.append(GetGT(sample, orig_alleles, allele_order)) writer.write("\t".join([str(item) for item in out_items])+"\n") - sys.stdout.write(f"All {num_records_processed:,} records processed: keep 
{num_snps_keeped:,} snps, {num_strs_keeped:,} STRs; remove {num_str_failed_ref:,} STRs mismatch with reference, {num_str_failed_max_allele:,} STRs with more than {args.max_alleles:,} alleles, {num_str_failed_min_allele:,} STRs less than {args.min_alleles} alleles.\n") + sys.stderr.write(f"All {num_records_processed:,} records processed: keep {num_snps_keeped:,} snps, {num_strs_keeped:,} STRs; remove {num_str_failed_ref:,} STRs mismatch with reference, {num_str_failed_max_allele:,} STRs with more than {args.max_alleles:,} alleles, {num_str_failed_min_allele:,} STRs less than {args.min_alleles} alleles.\n") reader.close() writer.close() sys.exit(0) From 705af298cb8e146dd871daa27f688843d1ef3d05 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Tue, 24 Sep 2024 16:29:57 -0700 Subject: [PATCH 24/27] document rm dups for fix-ref --- scripts/fix-ref/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/fix-ref/README.md b/scripts/fix-ref/README.md index 6b50a4c..4aecf60 100644 --- a/scripts/fix-ref/README.md +++ b/scripts/fix-ref/README.md @@ -9,6 +9,7 @@ This step: * Adds VT field * Remove loci with too many or too few alleles * Remove alleles with AF=0 +* Remove loci that have the same chr:pos:ref:alt after the above steps * Strip DS/GP fields ``` From 842b119c1b1446005756901f9c949438124b84b2 Mon Sep 17 00:00:00 2001 From: Yang_Li Date: Thu, 26 Sep 2024 21:58:22 -0700 Subject: [PATCH 25/27] fix file name, remove duplicate loci and update README --- README.md | 50 +++++++++---------- .../fix_ensembletr_snpstr_reference.py | 5 +- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 58541a0..5881c7c 100644 --- a/README.md +++ b/README.md @@ -146,55 +146,55 @@ These files contain the same data as [Version II](archive_ensembletr_datasets.md 3. Remove TRs which have more than 100 alleles. 4. Remove TRs which have less than 2 alleles. 5. Remove the DS/GP fields which are large and not used by downstream steps. -6. 
Add unique IDs for each TR of the format EnsTR:CHROM:POS. For TRs with the same CHR:POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. +6. Add unique IDs for each TR of the format EnsTR:CHROM:POS. For TRs with the same CHR:POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. Duplicated loci with identical alleles are removed. 7. Add VT field, set to VT=TR for TRs and VT=OTHER for other variant types -8. Add the .bref format files which have the same information as the VCFs but can improve Beagle imputation performance. +8. Add the bref format files which have the same information as the VCFs but can improve Beagle imputation performance. All file description and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). Data and links for each chromosome for the Verson III panel are also provided below. -Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref)] SNPs/indels=5,759,060 TRs=92,372 +Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref3)] SNPs/indels=5,759,060 TRs=92,372 -Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref)] SNPs/indels=6,088,598 TRs=91,133 +Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref3)] SNPs/indels=6,088,598 TRs=91,132 -Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref)] SNPs/indels=4,983,185 TRs=75,233 +Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref3)] SNPs/indels=4,983,185 TRs=75,233 -Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref)] SNPs/indels=4,875,465 TRs=69,325 +Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref3)] SNPs/indels=4,875,465 TRs=69,325 -Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref)] SNPs/indels=4,536,819 TRs=66,493 +Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref3)] SNPs/indels=4,536,819 TRs=66,492 -Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref)] SNPs/indels=4,315,217 TRs=65,938 +Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref3)] SNPs/indels=4,315,217 TRs=65,937 -Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref)] SNPs/indels=4,137,254 TRs=59,410 +Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref3)] SNPs/indels=4,137,254 TRs=59,409 -Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref)] SNPs/indels=3,886,222 TRs=55,141 +Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref3)] SNPs/indels=3,886,222 TRs=55,141 -Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref)] SNPs/indels=3,165,513 TRs=44,188 +Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref3)] SNPs/indels=3,165,513 TRs=44,188 -Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref)] SNPs/indels=3,495,473 TRs=51,637 +Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref3)] SNPs/indels=3,495,473 TRs=51,637 -Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref)] SNPs/indels=3,423,341 TRs=49,599 +Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref3)] SNPs/indels=3,423,341 TRs=49,598 -Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref)] SNPs/indels=3,332,788 TRs=55,886 +Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref3)] SNPs/indels=3,332,788 TRs=55,883 -Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref)] SNPs/indels=2,509,179 TRs=35,720 +Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref3)] SNPs/indels=2,509,179 TRs=35,720 -Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref)] SNPs/indels=2,290,400 TRs=36,199 +Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref3)] SNPs/indels=2,290,400 TRs=36,199 -Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref)] SNPs/indels=2,109,285 TRs=32,336 +Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref3)] SNPs/indels=2,109,285 TRs=32,336 -Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref)] SNPs/indels=2,362,361 TRs=35,451 +Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref3)] SNPs/indels=2,362,361 TRs=35,450 -Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref)] SNPs/indels=2,073,624 TRs=38,377 +Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref3)] SNPs/indels=2,073,624 TRs=38,377 -Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref)] SNPs/indels=1,963,845 TRs=28,444 +Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref3)] SNPs/indels=1,963,845 TRs=28,443 -Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref)] SNPs/indels=1,670,692 TRs=33,536 +Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref3)] SNPs/indels=1,670,692 TRs=33,535 -Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref)] SNPs/indels=1,644,384 TRs=25,743 +Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref3)] SNPs/indels=1,644,384 TRs=25,742 -Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref)] SNPs/indels=1,002,753 TRs=12,894 +Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref3)] SNPs/indels=1,002,753 TRs=12,893 -Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref)] SNPs/indels=1,066,557 TRs=15,643 +Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref3)] SNPs/indels=1,066,557 TRs=15,643 ### Usage @@ -203,7 +203,7 @@ Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impu ``` java -Xmx4g -jar beagle.version.jar \ gt=SNPs_chr${chrom}.vcf.gz \ - ref=ensembletr_refpanel_v3_chr${chrom}.bref \ + ref=ensembletr_refpanel_v3_chr${chrom}.bref3 \ out=imputed_TR_SNPs_chr${chrom} ``` diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 33c2ccd..ccfa5e5 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -153,7 +153,7 @@ def main(args): num_str_failed_ref = 0 num_str_failed_max_allele = 0 num_str_failed_min_allele = 0 - + num_str_duplicated = 0 for record in reader: num_records_processed += 1 if args.max_records > 0 and num_records_processed > args.max_records: @@ -184,6 +184,7 @@ def main(args): locinfo = f"{record.CHROM}:{record.POS}:"+",".join(allele_order) if locinfo in allloci: sys.stderr.write(f"Skipping duplicate {locinfo}\n") + num_str_duplicated += 1 continue else: allloci.add(locinfo) @@ -203,7 +204,7 @@ def main(args): for sample in record.genotypes: out_items.append(GetGT(sample, orig_alleles, allele_order)) writer.write("\t".join([str(item) for item in out_items])+"\n") - sys.stderr.write(f"All {num_records_processed:,} records 
processed: keep {num_snps_keeped:,} snps, {num_strs_keeped:,} STRs; remove {num_str_failed_ref:,} STRs mismatch with reference, {num_str_failed_max_allele:,} STRs with more than {args.max_alleles:,} alleles, {num_str_failed_min_allele:,} STRs less than {args.min_alleles} alleles.\n") + sys.stderr.write(f"All {num_records_processed:,} records processed: keep {num_snps_keeped:,} snps, {num_strs_keeped:,} STRs; remove {num_str_failed_ref:,} STRs mismatch with reference, {num_str_failed_max_allele:,} STRs with more than {args.max_alleles:,} alleles, {num_str_failed_min_allele:,} STRs less than {args.min_alleles} alleles, {num_str_duplicated:,} duplicated STRs.\n") reader.close() writer.close() sys.exit(0) From 94904c9fdaa78a8f50494a5ea0f26322b6766ab1 Mon Sep 17 00:00:00 2001 From: Melissa Gymrek Date: Sun, 3 Nov 2024 12:58:03 -0800 Subject: [PATCH 26/27] fix issue with REF count 0 in fix ref script --- scripts/fix-ref/fix_ensembletr_snpstr_reference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py index 33c2ccd..4461ae5 100755 --- a/scripts/fix-ref/fix_ensembletr_snpstr_reference.py +++ b/scripts/fix-ref/fix_ensembletr_snpstr_reference.py @@ -47,7 +47,9 @@ def GetAlleleCounts(record): allele_counts = {} for a in alleles: acount = all_allele_calls.count(a) - if acount > 0: allele_counts[a] = acount + # Note: force REF to be in dict even if acount=0 + if acount > 0 or a == record.REF: + allele_counts[a] = acount # Keep order same as original allele_order = [item for item in alleles if item in allele_counts.keys()] return allele_counts, allele_order From ab3044bee36ce1edaf50c46bf75d2bd39b801912 Mon Sep 17 00:00:00 2001 From: Yang_Li Date: Wed, 6 Nov 2024 14:20:43 -0800 Subject: [PATCH 27/27] fix the missing reference allele --- README.md | 57 ++++++++++++++------------- archive_ensembletr_datasets.md | 72 +++++++++++++++++++++++++++++++++- 2 files 
changed, 100 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 5881c7c..2193f78 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,7 @@ statSTR --vcf EnsembleTR_file.vcf.gz # EnsembleTR data releases -Archived datasets, including the Version II calls and Version II haplotype panel files can be found [here](archive_ensembletr_datasets.md). +Archived datasets, including the Version II calls and other versions of haplotype panel files can be found [here](archive_ensembletr_datasets.md). ## Version II of EnsembleTR calls on samples from 1000 Genomes Project and H3Africa @@ -129,20 +129,21 @@ Chromosome 21 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntr Chromosome 22 [VCF file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr22_filtered.vcf.gz) and [tbi file](https://ensemble-tr.s3.us-east-2.amazonaws.com/add-vntrs/ensemble_chr22_filtered.vcf.gz.tbi) -## Version III of reference SNP+TR haplotype panel for imputation of TR variants +## Version IV of reference SNP+TR haplotype panel for imputation of TR variants These files contain: * [Phased SNP and indel variants](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/) of 3,202 samples from the 1000 Genomes Project (1kGP). * TRs phased/imputed from 3,202 1kGP samples based on EnsembleTR calls. -There are in total 1,070,698 TRs and 70,692,015 SNPs/indels. +There are in total 1,070,762 TRs and 70,692,015 SNPs/indels. All the coordinates are based on **hg38** human reference genome. These files contain the same data as [Version II](archive_ensembletr_datasets.md), with the following updates to facilitate use in downstream imputation pipelines: 1. Remove TRs for which the REF allele does not match the expected sequence based on CHR:POS -2. For each TR, remove alelles with 0 count. +2. For each TR, remove alleles with 0 count. + * If the reference allele has 0 count, keep the reference allele. 
3. Remove TRs which have more than 100 alleles. 4. Remove TRs which have less than 2 alleles. 5. Remove the DS/GP fields which are large and not used by downstream steps. @@ -150,51 +151,51 @@ These files contain the same data as [Version II](archive_ensembletr_datasets.md 7. Add VT field, set to VT=TR for TRs and VT=OTHER for other variant types 8. Add the bref format files which have the same information as the VCFs but can improve Beagle imputation performance. -All file description and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). Data and links for each chromosome for the Verson III panel are also provided below. +All file descriptions and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_4_readme.txt). Data and links for each chromosome for the Version IV panel are also provided below. -Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref3)] SNPs/indels=5,759,060 TRs=92,372 +Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr1.bref3)] SNPs/indels=5,759,060 TRs=92,378 -Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref3)] SNPs/indels=6,088,598 TRs=91,132 +Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr2.bref3)] SNPs/indels=6,088,598 TRs=91,137 -Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref3)] SNPs/indels=4,983,185 TRs=75,233 +Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr3.bref3)] SNPs/indels=4,983,185 TRs=75,243 -Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref3)] SNPs/indels=4,875,465 TRs=69,325 +Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr4.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr4.bref3)] SNPs/indels=4,875,465 TRs=69,327 -Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref3)] SNPs/indels=4,536,819 TRs=66,492 +Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr5.bref3)] SNPs/indels=4,536,819 TRs=66,492 -Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref3)] SNPs/indels=4,315,217 TRs=65,937 +Chromosome 6 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr6.bref3)] SNPs/indels=4,315,217 TRs=65,940 -Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref3)] SNPs/indels=4,137,254 TRs=59,409 +Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr7.bref3)] SNPs/indels=4,137,254 TRs=59,422 -Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref3)] SNPs/indels=3,886,222 TRs=55,141 +Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr8.bref3)] SNPs/indels=3,886,222 TRs=55,144 -Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref3)] SNPs/indels=3,165,513 TRs=44,188 +Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr9.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr9.bref3)] SNPs/indels=3,165,513 TRs=44,189 -Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref3)] SNPs/indels=3,495,473 TRs=51,637 +Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr10.bref3)] SNPs/indels=3,495,473 TRs=51,640 -Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref3)] SNPs/indels=3,423,341 TRs=49,598 +Chromosome 11 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr11.bref3)] SNPs/indels=3,423,341 TRs=49,603 -Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref3)] SNPs/indels=3,332,788 TRs=55,883 +Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr12.bref3)] SNPs/indels=3,332,788 TRs=55,887 -Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref3)] SNPs/indels=2,509,179 TRs=35,720 +Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr13.bref3)] SNPs/indels=2,509,179 TRs=35,720 -Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref3)] SNPs/indels=2,290,400 TRs=36,199 +Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr14.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr14.bref3)] SNPs/indels=2,290,400 TRs=36,203 -Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref3)] SNPs/indels=2,109,285 TRs=32,336 +Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr15.bref3)] SNPs/indels=2,109,285 TRs=32,338 -Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref3)] SNPs/indels=2,362,361 TRs=35,450 +Chromosome 16 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr16.bref3)] SNPs/indels=2,362,361 TRs=35,452 -Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref3)] SNPs/indels=2,073,624 TRs=38,377 +Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr17.bref3)] SNPs/indels=2,073,624 TRs=38,382 -Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref3)] SNPs/indels=1,963,845 TRs=28,443 +Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr18.bref3)] SNPs/indels=1,963,845 TRs=28,446 -Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref3)] SNPs/indels=1,670,692 TRs=33,535 +Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr19.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr19.bref3)] SNPs/indels=1,670,692 TRs=33,536 -Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref3)] SNPs/indels=1,644,384 TRs=25,742 +Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr20.bref3)] SNPs/indels=1,644,384 TRs=25,745 -Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref3)] SNPs/indels=1,002,753 TRs=12,893 +Chromosome 21 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr21.bref3)] SNPs/indels=1,002,753 TRs=12,894 -Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] 
[[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref3)] SNPs/indels=1,066,557 TRs=15,643 +Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v4/ensembletr_refpanel_v4_chr22.bref3)] SNPs/indels=1,066,557 TRs=15,644 ### Usage @@ -203,7 +204,7 @@ Use [Beagle](https://faculty.washington.edu/browning/beagle/beagle.html) to impu ``` java -Xmx4g -jar beagle.version.jar \ gt=SNPs_chr${chrom}.vcf.gz \ - ref=ensembletr_refpanel_v3_chr${chrom}.bref3 \ + ref=ensembletr_refpanel_v4_chr${chrom}.bref3 \ out=imputed_TR_SNPs_chr${chrom} ``` diff --git a/archive_ensembletr_datasets.md b/archive_ensembletr_datasets.md index 5716122..7ba0f65 100644 --- a/archive_ensembletr_datasets.md +++ b/archive_ensembletr_datasets.md @@ -2,6 +2,76 @@ This page contains links to older versions of ensembleTR files +## Version III of reference SNP+TR haplotype panel for imputation of TR variants + +### Dataset description +These files contain: +* [Phased SNP and indel variants](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/) of 3,202 samples from the 1000 Genomes Project (1kGP). +* TRs phased/imputed from 3,202 1kGP samples based on EnsembleTR calls. + +There are in total 1,070,685 TRs and 70,692,015 SNPs/indels. + +All the coordinates are based on **hg38** human reference genome. + +These files contain the same data as [Version II](archive_ensembletr_datasets.md), with the following updates to facilitate use in downstream imputation pipelines: + +1. 
Remove TRs for which the REF allele does not match the expected sequence based on CHR:POS +2. For each TR, remove alleles with 0 count. +3. Remove TRs which have more than 100 alleles. +4. Remove TRs which have fewer than 2 alleles. +5. Remove the DS/GP fields which are large and not used by downstream steps. +6. Add unique IDs for each TR of the format EnsTR:CHROM:POS. For TRs with the same CHR:POS, add the duplicate number of the TR following format: EnsTR:CHROM:POS:Duplicate_num. Duplicated loci with identical alleles are removed. +7. Add VT field, set to VT=TR for TRs and VT=OTHER for other variant types +8. Add the bref format files which have the same information as the VCFs but can improve Beagle imputation performance. + +### Availability + +All file descriptions and download links can be found [here](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_3_readme.txt). Data and links for each chromosome for the Version III panel are also provided below. 
+ +Chromosome 1 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr1.bref3)] SNPs/indels=5,759,060 TRs=92,372 + +Chromosome 2 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr2.bref3)] SNPs/indels=6,088,598 TRs=91,132 + +Chromosome 3 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr3.bref3)] SNPs/indels=4,983,185 TRs=75,233 + +Chromosome 4 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr4.bref3)] SNPs/indels=4,875,465 TRs=69,325 + +Chromosome 5 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr5.bref3)] SNPs/indels=4,536,819 TRs=66,492 + +Chromosome 6 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr6.bref3)] SNPs/indels=4,315,217 TRs=65,937 + +Chromosome 7 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr7.bref3)] SNPs/indels=4,137,254 TRs=59,409 + +Chromosome 8 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr8.bref3)] SNPs/indels=3,886,222 TRs=55,141 + +Chromosome 9 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr9.bref3)] SNPs/indels=3,165,513 TRs=44,188 + +Chromosome 10 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr10.bref3)] SNPs/indels=3,495,473 TRs=51,637 + +Chromosome 11 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr11.bref3)] SNPs/indels=3,423,341 TRs=49,598 + +Chromosome 12 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr12.bref3)] SNPs/indels=3,332,788 TRs=55,883 + +Chromosome 13 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr13.bref3)] SNPs/indels=2,509,179 TRs=35,720 + +Chromosome 14 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr14.bref3)] SNPs/indels=2,290,400 TRs=36,199 + +Chromosome 15 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr15.bref3)] SNPs/indels=2,109,285 TRs=32,336 + +Chromosome 16 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr16.bref3)] SNPs/indels=2,362,361 TRs=35,450 + +Chromosome 17 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr17.bref3)] SNPs/indels=2,073,624 TRs=38,377 + +Chromosome 18 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr18.bref3)] SNPs/indels=1,963,845 TRs=28,443 + +Chromosome 19 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr19.bref3)] SNPs/indels=1,670,692 TRs=33,535 + +Chromosome 20 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr20.bref3)] SNPs/indels=1,644,384 TRs=25,742 + +Chromosome 21 
[[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr21.bref3)] SNPs/indels=1,002,753 TRs=12,893 + +Chromosome 22 [[VCF](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz)] [[tbi](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.vcf.gz.tbi)] [[bref](https://ensemble-tr.s3.us-east-2.amazonaws.com/ensembletr-refpanel-v3/ensembletr_refpanel_v3_chr22.bref3)] SNPs/indels=1,066,557 TRs=15,643 + ## Version II of reference SNP+TR haplotype panel for imputation of TR variants ### Dataset description @@ -67,4 +137,4 @@ For version I of EnsembleTR calls, please use https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/split/ensemble_chr"$chr"_filtered.vcf.gz.tbi for tbi file. For version I of phased panels, please use -https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz.csi for tbi file. \ No newline at end of file +https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz for VCF file and https://ensemble-tr.s3.us-east-2.amazonaws.com/phased-split/chr"$chr"_final_SNP_merged.vcf.gz.csi for tbi file.