From 00ac9dac149f2bc3f8c71d72494207e2d0ebdd6f Mon Sep 17 00:00:00 2001 From: rwdavies Date: Fri, 24 Dec 2021 13:31:57 +0000 Subject: [PATCH] Should fix issue 9 --- QUILT/R/hla_prepare_functions.R | 30 +++++++++++-------- QUILT/R/quilt-hla-prepare-reference.R | 1 + .../QUILT_hla_reference_panel_construction.Md | 18 +++++++---- 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/QUILT/R/hla_prepare_functions.R b/QUILT/R/hla_prepare_functions.R index 4c550fb..94d356d 100644 --- a/QUILT/R/hla_prepare_functions.R +++ b/QUILT/R/hla_prepare_functions.R @@ -267,6 +267,7 @@ make_and_save_hla_full_alleles_filled_in <- function( ##gives where each allele comes from, which is reference allele, and strand ##print(hla_region) + if (hla_region %in% supplementary_gene_info[, "gene"]) { print_message(paste0("Processing HLA-", hla_region)) @@ -288,7 +289,6 @@ make_and_save_hla_full_alleles_filled_in <- function( temp=grep("Please",this) this=this[1:(temp-1)] - starts=grep("gDNA",this) ll=getseqs( starts[1]+2, @@ -298,8 +298,8 @@ make_and_save_hla_full_alleles_filled_in <- function( ) starts=c(starts,length(this)+2) -###amount to trim for first codon -####offset=as.double(this[starts[1]+1]) + ## amount to trim for first codon + ## offset=as.double(this[starts[1]+1]) for(k in 2:(length(starts)-1)) { ll=paste(ll,getseqs(starts[k]+2,starts[k+1]-1,paste(hla_region,"[*]",sep=""), this = this),sep="") @@ -308,19 +308,19 @@ make_and_save_hla_full_alleles_filled_in <- function( names(ll)=getnames(starts[1]+2,starts[2]-1,paste(hla_region,"[*]",sep=""), this = this) first_row <- which(names(ll) == matches[, "allele"]) - ##if (first_row != matches[, "first_row"]) { - ## stop("Error in lining up") - ## } + ## if (first_row != matches[, "first_row"]) { + ## stop("Error in lining up") + ## } ##print("...done") -#######find a match + ## find a match temp=matrix(nrow=length(ll),ncol=nchar(ll[1])) for(i in 1:ncol(temp)) temp[,i]=substring(ll,i,i) for(i in 1:ncol(temp)) temp[temp[,i]=="-",i]=temp[1,i] -#####need to alter + ##need to alter ##for(i in 1:ncol(temp)) temp[temp[,i]=="*",i]=temp[1,i] @@ -392,25 +392,29 @@ make_and_save_hla_full_alleles_filled_in <- function( temp=temp[,ourpos>=qq[1] & ourpos<=qq[2]] ourpos=ourpos[ourpos>=qq[1] & ourpos<=qq[2]] -####fill in + ##fill in -####space separated + ##space separated oldvalue=nrow(temp)*ncol(temp) newvalue=sum(temp=="*") + while(oldvalue>newvalue){ + hapstore=haps tempstore=temp -###guess missing values + + ##guess missing values ##print("Total missing still:") ##print(sum(temp=="*")) ##print("Total columns: ") ##print(ncol(haps)) ##print("Processing column to find nearest neighbour:") + for(i in 1:ncol(haps)){ if(sum(temp[i,]=="*")){ - ## if(!i%%20) {print(c(i,sum(temp[i,]=="*")))} + ## if(!i%%20) {print(c(i,sum(temp[i,]=="*")))} ####use haplotypes to find match cond2=(temp[i,]!="*") cond=haps[,i]!=-1 @@ -428,7 +432,7 @@ make_and_save_hla_full_alleles_filled_in <- function( rownames(allelemat)=c("A","C","G","T","-","*") for(k in 1:nrow(allelemat)) allelemat[k,]=colSums(newalleles==rownames(allelemat)[k]) best=1:ncol(allelemat);for(k in 1:ncol(allelemat)) best[k]=rownames(allelemat)[1:5][which(allelemat[1:5,k]==max(allelemat[1:5,k]))[1]] - best[colSums(allelemat[1:5,])==0]="*" + best[colSums(allelemat[1:5,, drop = FALSE])==0]="*" temp[i,!cond2]=best diff --git a/QUILT/R/quilt-hla-prepare-reference.R b/QUILT/R/quilt-hla-prepare-reference.R index 4722b51..1c00ff0 100644 --- a/QUILT/R/quilt-hla-prepare-reference.R +++ b/QUILT/R/quilt-hla-prepare-reference.R @@ -47,6 +47,7 @@ QUILT_HLA_prepare_reference <- function( nCores = 1 ) { + x <- as.list(environment()) command_line <- paste0( "QUILT_HLA_prepare_reference(", diff --git a/example/QUILT_hla_reference_panel_construction.Md b/example/QUILT_hla_reference_panel_construction.Md index 73992ee..0217813 100644 --- a/example/QUILT_hla_reference_panel_construction.Md +++ b/example/QUILT_hla_reference_panel_construction.Md @@ -7,9 +7,9 @@ Here code to generate the provided reference panel data package is provided and Where you want to store the files as you work on them, and where the output files should go ``` -inputs_dir=/data/smew1/rdavies/quilt_hla_2021_09_18/ -test_dir=/data/smew1/rdavies/quilt_hla_2021_09_18/ -reference_package_dir=/data/smew1/rdavies/quilt_hla_reference_panel_files_2021_09_18/ +inputs_dir=/data/smew1/rdavies/quilt_hla_2021_12_24_3430/ +test_dir=/data/smew1/rdavies/quilt_hla_2021_12_24_3430/ +reference_package_dir=/data/smew1/rdavies/quilt_hla_2021_12_24_3430/quilt_hla_reference_panel_files/ mkdir -p ${test_dir} mkdir -p ${inputs_dir} mkdir -p ${reference_package_dir} @@ -36,9 +36,15 @@ sed -i 's/sample population group sex/SAMPLE POP GROUP SEX/g' ${reference_sample Here we download the IPD-IGMT data ``` ## To download IPD-IGMT version 3.39, for example +ipdigmt_link=https://github.com/ANHIG/IMGTHLA/blob/032815608e6312b595b4aaf9904d5b4c189dd6dc/Alignments_Rel_3390.zip?raw=true +## To download IPD-IGMT version 3.43 +ipdigmt_link=https://github.com/ANHIG/IMGTHLA/blob/3430/Alignments_Rel_3430.zip?raw=true + + cd ${test_dir} -wget https://github.com/ANHIG/IMGTHLA/blob/032815608e6312b595b4aaf9904d5b4c189dd6dc/Alignments_Rel_3390.zip?raw=true -mv Alignments_Rel_3390.zip?raw=true Alignments_Rel_3390.zip +wget ${ipdigmt_link} +ipdigmt_filename=`basename ${ipdigmt_link} | sed 's/?raw=true//g'` +mv Alignments_Rel_3430.zip?raw=true ${ipdigmt_filename} ``` Here we download the database of HLA alleles @@ -87,7 +93,7 @@ cd ~/proj/QUILT/ ## change to the directory where you've cloned QUILT --nGen=100 \ --hla_gene_region_file=hla_ancillary_files/hlagenes.txt \ --hla_types_panel=${test_dir}/20181129_HLA_types_full_1000_Genomes_Project_panel.txt \ ---ipd_igmt_alignments_zip_file=${test_dir}Alignments_Rel_3390.zip \ +--ipd_igmt_alignments_zip_file=${test_dir}${ipdigmt_filename} \ --quilt_hla_supplementary_info_file=hla_ancillary_files/quilt_hla_supplementary_info.txt \ --full_regionStart=25587319 \ --full_regionEnd=33629686 \