From afcf295e55c644711ec68c5a095c1b33d3dccf4c Mon Sep 17 00:00:00 2001 From: rwdavies Date: Mon, 3 Jan 2022 20:06:21 +0000 Subject: [PATCH] 1.0.3 Simplify HLA ref panel construction --- CHANGELOG.md | 2 + QUILT/DESCRIPTION | 4 +- QUILT/R/hla_prepare_functions.R | 1223 +++++++++-------- QUILT/R/hla_prepare_phase_functions.R | 9 +- QUILT/R/quilt-hla-prepare-reference.R | 64 +- QUILT/R/quilt-hla.R | 35 +- QUILT/man/QUILT_HLA.Rd | 6 +- QUILT/man/QUILT_HLA_prepare_reference.Rd | 10 +- QUILT_HLA.R | 11 +- QUILT_HLA_prepare_reference.R | 18 +- README.md | 6 +- README_QUILT-HLA.md | 2 +- .../QUILT_hla_reference_panel_construction.Md | 26 +- scripts/hla_prepare_workflow.sh | 167 ++- scripts/test-cli.R | 2 +- 15 files changed, 869 insertions(+), 716 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 300c8db..899feae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +* v1.0.3 + * Simplify how QUILT HLA reference packages are built * v1.0.2 * Fix minor but that prevented HLA reference panel from building on new reference data * v1.0.1 diff --git a/QUILT/DESCRIPTION b/QUILT/DESCRIPTION index 937a404..01163db 100644 --- a/QUILT/DESCRIPTION +++ b/QUILT/DESCRIPTION @@ -1,8 +1,8 @@ Package: QUILT Type: Package Title: QUILT -Version: 1.0.2 -Date: 2021-12-24 +Version: 1.0.3 +Date: 2021-12-26 Author: Robert William Davies Maintainer: Robert William Davies Description: QUILT performs imputation of individuals sequenced to low coverage diff --git a/QUILT/R/hla_prepare_functions.R b/QUILT/R/hla_prepare_functions.R index 94d356d..24dc717 100644 --- a/QUILT/R/hla_prepare_functions.R +++ b/QUILT/R/hla_prepare_functions.R @@ -135,9 +135,12 @@ process <- function(varset){ make_and_save_hla_all_alleles_kmers <- function( outputdir, - ourfiles + all_hla_regions, + hla_gene_information ) { + ourfiles <- all_hla_regions + print_message("Begin making HLA all alleles kmers file") ## code to make HLAallalleleskmers.out @@ -226,9 +229,11 @@ make_and_save_hla_all_alleles_kmers <- function( for(i in 1:length(newnames)) newkmers[i,2]=paste(temps[starts[i]:ends[i],2],collapse=",") for(i in 1:length(newnames)) newkmers[i,3]=paste(temps[starts[i]:ends[i],3],collapse=",") + ## easy enough to save hla_gene_information here save( kmers, newkmers, + hla_gene_information, file = file_quilt_hla_all_alleles_kmers(outputdir) ) @@ -244,637 +249,735 @@ make_and_save_hla_all_alleles_kmers <- function( -make_and_save_hla_full_alleles_filled_in <- function( + + + + +make_single_snpformatalleles <- function( outputdir, - regs, - supplementary_gene_info + hla_region, + temp, + ll, + ourpos, + ourrow ) { - - print_message("Begin making HLA full alleles filled in file") + + ##varset will store an initial set of variant positions + ##print("Calling initial variants, sequences analysed:") + varset=matrix(nrow=0,ncol=4) + for(i in 1:nrow(temp)) { + ##if(!i%%50) print(i) + cc=getvars2(temp[ourrow,],temp[i,],ourpos) + if(length(cc)) varset=rbind(varset,cbind(matrix(cc,ncol=3),i)) + } + ##print("...done: filtering variants") - ## code to make HLAxxfullallelesfilledin.out - ## depends on hla*snpformatalleles.out from above - ## load("hlareglist.out") - ##trying the below! 
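+    ## collapse the per-allele variant calls in varset (position, the two allele strings,
+    ## allele index) down to unique variant records by pasting position and alleles into
+    ## "W"-separated keys, de-duplicating, and splitting back into a three-column matrix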
+ cccc=paste(varset[,1],rep("W",nrow(varset)),varset[,2],rep("W",nrow(varset)),varset[,3],rep("W",nrow(varset)),sep="") + length(unique(cccc)) + cccc=unique(cccc) + cccc2=cccc + cccc=strsplit(cccc,"W") + newset=matrix(nrow=length(cccc),ncol=3) + for(i in 1:nrow(newset)) newset[i,]=cccc[i][[1]] + newset[1:5,] + dim(newset) - ourfiles <- regs + ##modify to simpler way of calling mutations + varset2=process(newset) + knownvars=varset2 + knownvars=knownvars[order(as.double(knownvars[,1])),] + ##this has removed identical bases + ##$filter variants where can co-occur depending on calling protocol - for(zztop in (1:length(ourfiles))){ - - hla_region <- ourfiles[zztop] - ##now read in for a region, the sequences and adapt them - ##load(paste0("hla", hla_region, "pos.out")) - ##gives where each allele comes from, which is reference allele, and strand - - ##print(hla_region) + + newvars=matrix(nrow=0,ncol=ncol(knownvars)) + all1=knownvars[,2] + all2=knownvars[,3] + for(i in 1:length(all1)) if(nchar(all1[i])1000 | nchar(all2[i])>1000) rm[i]=1 + else{ - this <- scan( - file.path( - outputdir, - "alignments", - paste0(hla_region, "_gen.txt") - ), - what = 'char', - quiet = TRUE - ) - ##print("Processing input...") - - temp=grep("Please",this) - this=this[1:(temp-1)] + overlaps=which(spos<=spos[i] & epos>=epos[i]) + toto[i]=length(overlaps) - starts=grep("gDNA",this) - ll=getseqs( - starts[1]+2, - starts[2]-1, - paste(hla_region,"[*]",sep=""), - this = this - ) - starts=c(starts,length(this)+2) - - ## amount to trim for first codon - ## offset=as.double(this[starts[1]+1]) - - for(k in 2:(length(starts)-1)) { - ll=paste(ll,getseqs(starts[k]+2,starts[k+1]-1,paste(hla_region,"[*]",sep=""), this = this),sep="") - ##print(k) - } - names(ll)=getnames(starts[1]+2,starts[2]-1,paste(hla_region,"[*]",sep=""), this = this) - - first_row <- which(names(ll) == matches[, "allele"]) - ## if (first_row != matches[, "first_row"]) { - ## stop("Error in lining up") - ## } - - ##print("...done") - - ## find a match - - temp=matrix(nrow=length(ll),ncol=nchar(ll[1])) - - for(i in 1:ncol(temp)) temp[,i]=substring(ll,i,i) - for(i in 1:ncol(temp)) temp[temp[,i]=="-",i]=temp[1,i] - ##need to alter - - ##for(i in 1:ncol(temp)) temp[temp[,i]=="*",i]=temp[1,i] - - spos=which(temp[1,]=="|")[1] - temp=temp[,(spos+1):ncol(temp)] - temp=temp[,temp[1,]!="|"] - - if(matches[1,"strand"]!=1){ - temp[temp=="A"]="t" - temp[temp=="C"]="g" - temp[temp=="G"]="c" - temp[temp=="T"]="a" - temp[temp=="a"]="A" - temp[temp=="c"]="C" - temp[temp=="g"]="G" - temp[temp=="t"]="T" - temp=temp[,ncol(temp):1] - } - - - aligned=sum(temp[first_row,] %in% c("A","C","G","T")) +######first just take subset case, only worry about shorter indels - ##perfect, now need to align to positions in the reference - ourrow <- first_row - ourpos=rep(0,ncol(temp)) - - if(matches[1,"strand"]==1) { - ourpos[1]=matches[1,"genome_pos"] - } else { - ourpos[1]=matches[1,"genome_pos"]-(aligned-1) + if(length(overlaps)>1){ + overlaps=overlaps[overlaps!=i] + for(k in 1:length(overlaps)){ + if(length(grep(all1[i],all1[overlaps[k]])) & length(grep(all2[i],all2[overlaps[k]])) & (nchar(all1[overlaps[k]])-nchar(all1[i])<=25 | nchar(knownvars[i,2])!=nchar(knownvars[i,3]))){ + + cccc=substring(all1[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) + cccc2=substring(all2[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) + + if(sum(all1[i]==cccc & all2[i]==cccc2)) rm[i]=1 + + } + 
if(!length(grep(all1[i],all2[overlaps[k]])) & length(grep(all2[i],all1[overlaps[k]])) & + (nchar(all1[overlaps[k]])-nchar(all1[i])<=25 | nchar(knownvars[i,2])!=nchar(knownvars[i,3]))){ + + cccc=substring(all1[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) + cccc2=substring(all2[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) + + if(sum(all2[i]==cccc & all1[i]==cccc2)) rm[i]=1 + + + } + } } + ##partial overlap case + overlaps=which(spos[i]=spos & epos[i]spos & spos[i]<=epos & epos[i]>epos) + if(length(overlaps)) rm[i]=1 - curpos=ourpos[1] - - for(i in 2:length(ourpos)){ - if(temp[ourrow,i] %in% c("A","C","G","T","*")) curpos=curpos+1 - ourpos[i]=curpos - } } + ##print(length(overlaps)) + } + + knownvarsfiltered=knownvars[rm==0,] + ##print("....done:Making variant calls, percentage complete:") + ##this is a final set of variant positions after filtering + + ##use to make calls + tempmat=matrix(nrow=0,ncol=7) + ##resmat now gives the called type at each of the variant positions + resmat=matrix(0,nrow=nrow(temp),ncol=nrow(knownvarsfiltered)) + + for(q in 1:nrow(knownvarsfiltered)){ + ## if(!q%%50) print(q/nrow(knownvarsfiltered)*100) + a1=knownvarsfiltered[q,2] + + a2=knownvarsfiltered[q,3] + pos=as.double(knownvarsfiltered[q,1]) + start=pos + end=pos+nchar(a1)-1 + flag=0 + if(end=start & ourpos<=end) + if(flag==1){ + s1=s1[2:length(s1)] + } + checkmat=temp[,s1] + checkmat[checkmat=="."]="" + checkmat=matrix(checkmat,ncol=length(s1)) + types=rep("",nrow(checkmat)) + for(i in 1:ncol(checkmat)){ + types=paste(types,checkmat[,i],sep="") + } + resmat[types==a2,q]=1 + rs=grep("[*]",types) + if(length(rs)) resmat[rs,q]=-1 + ##print(c(a1,a2)) + ##print(table(types)) + ##print(c(q,sum(resmat[,q]))) + tempmat=rbind(tempmat,c(a1,a2,length(unique(types)),unique(types)[1:2],q,sum(resmat[,q]))) + } + ##print("...done") + ##print("Final clean-up...") + ##can be an issue where some variants are never seen, remove these (it is because there is local similarity and more than one correct local aligment so may be inconsisently called) + + knownvarsfiltered=knownvarsfiltered[colSums(resmat>0)>0,] + resmat=resmat[,colSums(resmat>0)>0] + rownames(resmat)=names(ll) + ##print("...done, saving to file") + save( + resmat, + knownvarsfiltered, + file = file_quilt_hla_snpformatalleles(outputdir, hla_region) + ) + + 1 + +} + +make_single_hla_full_alleles_filled_in <- function( + outputdir, + hla_region, + temp, + ll, + ourpos, + resmat, + knownvarsfiltered +) { + + temp2=list(ol=knownvarsfiltered,ourallelemat=resmat) + + sample=cbind(rownames(temp2$ourallelemat),rownames(temp2$ourallelemat),0) + colnames(sample)=c("ID_1","ID_2","missing") + + pos=cbind(paste("chr6:",temp2$ol[,1],sep=""),temp2$ol[,1],temp2$ol[,2],temp2$ol[,3]) + colnames(pos)=c("id","position","a0","a1") + + ##print("Filtering variants using missing data...") + + haps=t(temp2$ourallelemat) + cond=pos[,3]%in%c("A","C","G","T") & pos[,4]%in%c("A","C","G","T") & rowMeans(haps==-1)<0.1 + haps=haps[cond ,] + pos=pos[cond,] + + ##print("...done") + + qq=range(as.double(pos[,2])) + qq[1]=qq[1]-10 + qq[2]=qq[2]+10 + + temp=temp[,ourpos>=qq[1] & ourpos<=qq[2]] + ourpos=ourpos[ourpos>=qq[1] & ourpos<=qq[2]] + + ##fill in + + ##space separated + oldvalue=nrow(temp)*ncol(temp) + newvalue=sum(temp=="*") + + while(oldvalue>newvalue){ - ##end up with sequences ll, and a matrix temp storing all the known alleles - - ##print_message("Reading in SNP format data for 
region") - load(file_quilt_hla_snpformatalleles(outputdir, hla_region)) - ##print("...done") - - temp2=list(ol=knownvarsfiltered,ourallelemat=resmat) - - sample=cbind(rownames(temp2$ourallelemat),rownames(temp2$ourallelemat),0) - colnames(sample)=c("ID_1","ID_2","missing") - - pos=cbind(paste("chr6:",temp2$ol[,1],sep=""),temp2$ol[,1],temp2$ol[,2],temp2$ol[,3]) - colnames(pos)=c("id","position","a0","a1") - - ##print("Filtering variants using missing data...") - - haps=t(temp2$ourallelemat) - cond=pos[,3]%in%c("A","C","G","T") & pos[,4]%in%c("A","C","G","T") & rowMeans(haps==-1)<0.1 - haps=haps[cond ,] - pos=pos[cond,] - - ##print("...done") - - qq=range(as.double(pos[,2])) - qq[1]=qq[1]-10 - qq[2]=qq[2]+10 - - temp=temp[,ourpos>=qq[1] & ourpos<=qq[2]] - ourpos=ourpos[ourpos>=qq[1] & ourpos<=qq[2]] - - ##fill in + hapstore=haps + tempstore=temp - ##space separated - oldvalue=nrow(temp)*ncol(temp) - newvalue=sum(temp=="*") + ##guess missing values + ##print("Total missing still:") + ##print(sum(temp=="*")) + ##print("Total columns: ") + ##print(ncol(haps)) + ##print("Processing column to find nearest neighbour:") - while(oldvalue>newvalue){ - - hapstore=haps - tempstore=temp - - ##guess missing values - ##print("Total missing still:") - ##print(sum(temp=="*")) - ##print("Total columns: ") - ##print(ncol(haps)) - ##print("Processing column to find nearest neighbour:") + for(i in 1:ncol(haps)){ - for(i in 1:ncol(haps)){ + if(sum(temp[i,]=="*")){ - if(sum(temp[i,]=="*")){ - - ## if(!i%%20) {print(c(i,sum(temp[i,]=="*")))} + ## if(!i%%20) {print(c(i,sum(temp[i,]=="*")))} ####use haplotypes to find match - cond2=(temp[i,]!="*") - cond=haps[,i]!=-1 - c1=hapstore[cond,i] - c2=hapstore[cond,]-hapstore[cond,i] - dist=0;dist2=0; - if(sum(c1>0)>1) dist=colSums(c2[c1>0,]== -1) - if(sum(c1==0)>1) dist2=colSums(c2[c1==0,]==1) - if(sum(c1>0)==1) dist=as.double(c2[c1>0,]== -1) - if(sum(c1==0)==1) dist2=as.double(c2[c1==0,]==1) - dist=dist+dist2 - newalleles=matrix(tempstore[dist==min(dist[-i]),!cond2],ncol=sum(!cond2)) - ##newhaps=matrix(hapstore[!cond,dist==min(dist[-i])],nrow=sum(!cond)) - allelemat=matrix(nrow=6,ncol=sum(!cond2)) - rownames(allelemat)=c("A","C","G","T","-","*") - for(k in 1:nrow(allelemat)) allelemat[k,]=colSums(newalleles==rownames(allelemat)[k]) - best=1:ncol(allelemat);for(k in 1:ncol(allelemat)) best[k]=rownames(allelemat)[1:5][which(allelemat[1:5,k]==max(allelemat[1:5,k]))[1]] - best[colSums(allelemat[1:5,, drop = FALSE])==0]="*" - temp[i,!cond2]=best - - - } + cond2=(temp[i,]!="*") + cond=haps[,i]!=-1 + c1=hapstore[cond,i] + c2=hapstore[cond,]-hapstore[cond,i] + dist=0;dist2=0; + if(sum(c1>0)>1) dist=colSums(c2[c1>0,]== -1) + if(sum(c1==0)>1) dist2=colSums(c2[c1==0,]==1) + if(sum(c1>0)==1) dist=as.double(c2[c1>0,]== -1) + if(sum(c1==0)==1) dist2=as.double(c2[c1==0,]==1) + dist=dist+dist2 + newalleles=matrix(tempstore[dist==min(dist[-i]),!cond2],ncol=sum(!cond2)) + ##newhaps=matrix(hapstore[!cond,dist==min(dist[-i])],nrow=sum(!cond)) + allelemat=matrix(nrow=6,ncol=sum(!cond2)) + rownames(allelemat)=c("A","C","G","T","-","*") + for(k in 1:nrow(allelemat)) allelemat[k,]=colSums(newalleles==rownames(allelemat)[k]) + best=1:ncol(allelemat);for(k in 1:ncol(allelemat)) best[k]=rownames(allelemat)[1:5][which(allelemat[1:5,k]==max(allelemat[1:5,k]))[1]] + best[colSums(allelemat[1:5,, drop = FALSE])==0]="*" + temp[i,!cond2]=best + + } - oldvalue=newvalue - newvalue=sum(temp=="*") - } - ##print("....done") - - ##guess missing values - ## print_message("Inferring remaining missing values") + 
oldvalue=newvalue + newvalue=sum(temp=="*") - ##print(sum(haps==-1)) - hapstore=haps - tempstore=temp - ##print("Processing column:") - for(i in 1:ncol(haps)){ - - if(sum(temp[i,]=="*")){ + } + ##print("....done") + + ##guess missing values + ## print_message("Inferring remaining missing values") + + ##print(sum(haps==-1)) + hapstore=haps + tempstore=temp + ##print("Processing column:") + for(i in 1:ncol(haps)){ + + if(sum(temp[i,]=="*")){ + + ##print(i) + cond=haps[,i]!=-1 + cond2=(temp[i,]=="*") + alter=which(cond2==T) + c1=hapstore[cond,i] + c2=hapstore[cond,]-hapstore[cond,i] + dist=0;dist2=0; + if(sum(c1>0)>1) dist=colSums(c2[c1>0,]== -1) + if(sum(c1==0)>1) dist2=colSums(c2[c1==0,]==1) + if(sum(c1>0)==1) dist=as.double(c2[c1>0,]== -1) + if(sum(c1==0)==1) dist2=as.double(c2[c1==0,]==1) + dist=dist+dist2 + for(j in alter){ + t1=temp[,j] + dd=dist[t1!="*"] + ##print(min(dd)) + possalleles=t1[t1!="*"][dd==min(dd)] + cc=1:5*0;names(cc)=c("A","C","G","T","-") + for(k in 1:5) cc[k]=sum(possalleles==names(cc)[k]) + temp[i,j]=names(cc)[which(cc==max(cc))[1]] - ##print(i) - cond=haps[,i]!=-1 - cond2=(temp[i,]=="*") - alter=which(cond2==T) - c1=hapstore[cond,i] - c2=hapstore[cond,]-hapstore[cond,i] - dist=0;dist2=0; - if(sum(c1>0)>1) dist=colSums(c2[c1>0,]== -1) - if(sum(c1==0)>1) dist2=colSums(c2[c1==0,]==1) - if(sum(c1>0)==1) dist=as.double(c2[c1>0,]== -1) - if(sum(c1==0)==1) dist2=as.double(c2[c1==0,]==1) - dist=dist+dist2 - for(j in alter){ - t1=temp[,j] - dd=dist[t1!="*"] - ##print(min(dd)) - possalleles=t1[t1!="*"][dd==min(dd)] - cc=1:5*0;names(cc)=c("A","C","G","T","-") - for(k in 1:5) cc[k]=sum(possalleles==names(cc)[k]) - temp[i,j]=names(cc)[which(cc==max(cc))[1]] - - } } } - ##print("...done") + } + ##print("...done") + + ##print("Saving...") + fullalleles=temp + rownames(fullalleles)=names(ll) + + save( + ourpos, + fullalleles, + file = file_quilt_hla_full_alleles_filled_in(outputdir, hla_region) + ) + +} + + + +get_and_reformat_gen_alignments_for_hla_region <- function( + outputdir, + hla_region, + hla_strand +) { + + this <- scan( + file.path( + outputdir, + "alignments", + paste0(hla_region, "_gen.txt") + ), + what = 'char', + quiet = TRUE + ) + + temp=grep("Please",this) + this=this[1:(temp-1)] + + starts=grep("gDNA",this) + ll=getseqs( + starts[1]+2, + starts[2]-1, + paste(hla_region,"[*]",sep=""), + this = this + ) + starts=c(starts,length(this)+2) + + ##amount to trim for first codon + ##offset=as.double(this[starts[1]+1]) + + for(k in 2:(length(starts)-1)) { + ll <- paste0( + ll, + getseqs( + starts[k]+2, + starts[k+1]-1, + paste(hla_region,"[*]",sep=""), + this = this + ) + ) + ##print(k) + } + names(ll)=getnames(starts[1]+2,starts[2]-1,paste(hla_region,"[*]",sep=""), this = this) + + + ##temp is matrix of alignments, stranded appropriately + + temp=matrix(nrow=length(ll),ncol=nchar(ll[1])) + + for(i in 1:ncol(temp)) temp[,i]=substring(ll,i,i) + for(i in 1:ncol(temp)) temp[temp[,i]=="-",i]=temp[1,i] + + ## OK looks like the logic was OK here (Simon original) + ## not sure what the numbering is then if it is sometimes wrong! 
+ ## in any case, easy to calculate + spos1 <- which(temp[1,]=="|")[1] + offset <- as.numeric(this[grep("gDNA", this)[1] + 1]) * -1 + ## want the offset + 1 base after removing periods + spos2 <- which(cumsum(temp[1, ] != ".") == (offset + 1)) + + n <- c( + spos1 = spos1, + spos2 = spos2, + nperiod1 = sum(temp[1, 1:spos1] == "."), + nperiod2 = sum(temp[1, 1:spos2] == "."), + nbar1 = sum(temp[1, 1:spos1] == "|"), + nbar2 = sum(temp[1, 1:spos2] == "|") + ) - ##print("Saving...") - fullalleles=temp - rownames(fullalleles)=names(ll) - save( - ourpos, - fullalleles, - file = file_quilt_hla_full_alleles_filled_in(outputdir, hla_region) + ## remove before the start of CDS + temp <- temp[,(spos1 + 1):ncol(temp)] + temp <- temp[, temp[1,] != "|"] + + if(hla_strand != 1){ + temp[temp=="A"]="t" + temp[temp=="C"]="g" + temp[temp=="G"]="c" + temp[temp=="T"]="a" + temp[temp=="a"]="A" + temp[temp=="c"]="C" + temp[temp=="g"]="G" + temp[temp=="t"]="T" + temp=temp[,ncol(temp):1] + } + + return( + list( + ll = ll, + temp = temp, + n = n ) - ##print("...done") - + ) +} + + +make_single_hla_full <- function( + outputdir, + hla_region, + fullalleles +) { + + kmers=vector(length=0) + positions=vector(length=0) + xx=vector(length=0) + + ##print("Making kmers by allele:") + for(i in 1:nrow(fullalleles)){ + ##if(!i%%20) print(i) + zz=1:(ncol(fullalleles)-9) + ww=fullalleles[i,] + zz=zz[ww!="."] + ww=ww[ww!="."] + dd=paste(ww,collapse="") + ee=substring(dd,1:(nchar(dd)-9),10:nchar(dd)) + ##unique only + hh=paste(ee,zz) + cond=!(hh %in% xx) + if(sum(cond)){ + ee=ee[cond] + zz=zz[cond] + hh=hh[cond] + kmers=c(kmers,ee) + positions=c(positions,zz) + xx=c(xx,hh) + } + } + xx=xx[!is.na(kmers)] + positions=positions[!is.na(kmers)] + kmers=kmers[!is.na(kmers)] + ##can save this + ##print("..done") + + ##uniquify + kk=unique(kmers) + names(kk)=kk + for(i in 1:length(kk)) kk[i]=paste(positions[kmers==names(kk)[i]],collapse=",") + + ##for each allele, what base is column i in alignment (0 means gap) + lookup=matrix(0,nrow=nrow(fullalleles),ncol=ncol(fullalleles)) + ##for each allele, where is base i in columns of alignment + revlookup=lookup + + ##print("Reverse-lookup calculation") + for(i in 1:nrow(fullalleles)){ + t1=1:ncol(fullalleles) + t2=fullalleles[i,] + t1=t1[t2!="."] + lookup[i,t1]=1:length(t1) + revlookup[i,1:length(t1)]=t1 } + ##print("....done") + ##print("Saving kmers...") + save( + kmers, + positions, + xx, + revlookup, + lookup, + fullalleles, + file = file_quilt_hla_full(outputdir, hla_region) + ) + ##print("...done") - ## end of code to make HLAxxfullallelesfilledin.out - print_message("Done making HLA full alleles filled in file") + 0 } -make_quilt_hla_full <- function( + + +make_and_save_hla_files_for_imputation <- function( outputdir, - regs + hla_regions_to_prepare, + hla_gene_information, + ref_fasta, + nCores ) { + ## code to make hla*snpformatalleles.out + print_message("Begin making HLA snp format alleles file") - ## - ##code to make hla*full.out - ## - print_message("Begin making HLA full file") - - ## the below creates lists of kmers and their positions for all the HLA alleles at each region and stores it in a file hla*full.out - ## it is used for rapid alignment of reads to the overall alignment, and these alignments can then be scored, based on exact 10mer matches - ##depends on HLAxxfullallelesfilledin.out above + out <- mclapply(1:length(hla_regions_to_prepare), mc.cores = nCores, function(i_region){ - ourfiles <- regs - for(k in 1:length(ourfiles)){ + hla_region <- 
hla_regions_to_prepare[i_region] - hla_region <- ourfiles[k] + print_message(paste0("Working on region:", hla_region)) - n <- length(ourfiles) - if (k %in% (match(1:10, ceiling(10 * (1:n / n))))) { - print_message(paste0("Processing file ", k, " out of ", length(ourfiles))) + m <- match(paste0("HLA-", hla_region), hla_gene_information[, "Name"]) + hla_strand <- hla_gene_information[m, "Strand"] + if (hla_strand == 1) { + genome_pos <- hla_gene_information[m, "Start"] + } else { + genome_pos <- hla_gene_information[m, "End"] } - ## load(paste("HLA",ourfiles[k],"fullallelesfilledin.out",sep="")) - load(file_quilt_hla_full_alleles_filled_in(outputdir, hla_region)) + out <- get_and_reformat_gen_alignments_for_hla_region( + outputdir = outputdir, + hla_region = hla_region, + hla_strand = hla_strand + ) + ll <- out[["ll"]] + temp <- out[["temp"]] - kmers=vector(length=0) - positions=vector(length=0) - xx=vector(length=0) - - ##print("Making kmers by allele:") - for(i in 1:nrow(fullalleles)){ - ##if(!i%%20) print(i) - zz=1:(ncol(fullalleles)-9) - ww=fullalleles[i,] - zz=zz[ww!="."] - ww=ww[ww!="."] - dd=paste(ww,collapse="") - ee=substring(dd,1:(nchar(dd)-9),10:nchar(dd)) - ##unique only - hh=paste(ee,zz) - cond=!(hh %in% xx) - if(sum(cond)){ ee=ee[cond];zz=zz[cond];hh=hh[cond];kmers=c(kmers,ee);positions=c(positions,zz);xx=c(xx,hh); - } + aligned <- sum(temp[1,] %in% c("A","C","G","T")) + + if(hla_strand == 1) { + start <- genome_pos + } else { + start <- genome_pos - (aligned - 1) + } + end <- start + aligned - 1 + + reference_allele <- determine_reference_genome_hla_allele( + ref_fasta = ref_fasta, + chr = hla_gene_information[1, "Chr"], + start = start, + end = end, + ll = ll, + temp = temp, + hla_strand = hla_strand + ) + + print_message(paste0("Determined automatically that the reference genome contains allele:", reference_allele)) + + ## + ## I don't fully understand this in simon's code + ## but now need positions (somehow) with respect to this allele + ## does not align obviously to genome, but works later! 
+ ## + first_row <- which(names(ll) == reference_allele) + aligned <- sum(temp[first_row,] %in% c("A","C","G","T")) + + if(hla_strand == 1) { + start <- genome_pos + } else { + start <- genome_pos - (aligned - 1) } - xx=xx[!is.na(kmers)] - positions=positions[!is.na(kmers)] - kmers=kmers[!is.na(kmers)] - ##can save this - ##print("..done") - - ##uniquify - kk=unique(kmers) - names(kk)=kk - for(i in 1:length(kk)) kk[i]=paste(positions[kmers==names(kk)[i]],collapse=",") - - ##for each allele, what base is column i in alignment (0 means gap) - lookup=matrix(0,nrow=nrow(fullalleles),ncol=ncol(fullalleles)) - ##for each allele, where is base i in columns of alignment - revlookup=lookup - - ##print("Reverse-lookup calculation") - for(i in 1:nrow(fullalleles)){ - t1=1:ncol(fullalleles) - t2=fullalleles[i,] - t1=t1[t2!="."] - lookup[i,t1]=1:length(t1) - revlookup[i,1:length(t1)]=t1 + + ourpos <- rep(0,ncol(temp)) + ourpos[1] <- start + + curpos <- ourpos[1] + ourrow <- first_row + for(i in 2:length(ourpos)){ + if(temp[ourrow,i] %in% c("A","C","G","T","*")) { + curpos <- curpos + 1 + } + ourpos[i] <- curpos } - ##print("....done") - ##print("Saving kmers...") - save( - kmers, - positions, - xx, - revlookup, - lookup, - fullalleles, - file = file_quilt_hla_full(outputdir, hla_region) + ## ourpos gives positions relative to the genome reference sequence + + ## + ## now make the snpformatalleles and the other one + ## + print_message(paste0("Make SNP format alleles file for:", hla_region)) + out <- make_single_snpformatalleles( + outputdir = outputdir, + hla_region = hla_region, + temp = temp, + ll = ll, + ourpos = ourpos, + ourrow = ourrow + ) + load(file_quilt_hla_snpformatalleles(outputdir, hla_region)) + + print_message(paste0("Make full alleles filled in file for:", hla_region)) + make_single_hla_full_alleles_filled_in( + outputdir = outputdir, + hla_region = hla_region, + temp = temp, + ll = ll, + ourpos = ourpos, + resmat = resmat, + knownvarsfiltered = knownvarsfiltered ) - ##print("...done") - } + load(file_quilt_hla_full_alleles_filled_in(outputdir, hla_region)) - ## end of code to make hla*full.out - print_message("Done making HLA full file") + print_message(paste0("Make final files:", hla_region)) + make_single_hla_full( + outputdir = outputdir, + hla_region = hla_region, + fullalleles = fullalleles + ) + + print_message(paste0("Done working on region:", hla_region)) + + 0 + + }) + + check_mclapply_OK(out) + print_message("Done making HLA snp format alleles file") + } -make_and_save_hla_snpformatalleles <- function( - outputdir, - regs, - supplementary_gene_info -) { - ## code to make hla*snpformatalleles.out - print_message("Begin making HLA snp format alleles file") - - ## annotate all the snp positions and positions - ## load("hlaseq.out") - ## filenames=read.table("temp.out",as.is=T,fill=T) - ## ourfiles=filenames[,4];ourfiles=ourfiles[grep("gen.txt",ourfiles)] - ## ourfiles <- dir()[grep("gen", dir())] - ##ourfiles=gsub("_gen.txt","",ourfiles) - ourfiles <- regs - - for(zztop in c(1:length(ourfiles))){ - hla_region <- ourfiles[zztop] - if (hla_region %in% supplementary_gene_info[, "gene"]) { - matches <- supplementary_gene_info[ - match(hla_region, supplementary_gene_info[, "gene"]), - ] - - print_message(paste0("Working on region:", hla_region)) - - this <- scan( - file.path( - outputdir, - "alignments", - paste0(hla_region, "_gen.txt") - ), - what = 'char', - quiet = TRUE - ) - - temp=grep("Please",this) - this=this[1:(temp-1)] - - starts=grep("gDNA",this) - ll=getseqs( - 
starts[1]+2, - starts[2]-1, - paste(hla_region,"[*]",sep=""), - this = this - ) - starts=c(starts,length(this)+2) - - ##amount to trim for first codon - ##offset=as.double(this[starts[1]+1]) - - for(k in 2:(length(starts)-1)) { - ll <- paste0( - ll, - getseqs( - starts[k]+2, - starts[k+1]-1, - paste(hla_region,"[*]",sep=""), - this = this - ) - ) - ##print(k) - } - names(ll)=getnames(starts[1]+2,starts[2]-1,paste(hla_region,"[*]",sep=""), this = this) - first_row <- which(names(ll) == matches[, "allele"]) - ## perform check here - ##if (first_row != matches[, "first_row"]) { - ## stop("Error in lining up") - ## } - - ##temp is matrix of alignments, stranded appropriately - - temp=matrix(nrow=length(ll),ncol=nchar(ll[1])) - - for(i in 1:ncol(temp)) temp[,i]=substring(ll,i,i) - for(i in 1:ncol(temp)) temp[temp[,i]=="-",i]=temp[1,i] - - spos=which(temp[1,]=="|")[1] - temp=temp[,(spos+1):ncol(temp)] - temp=temp[,temp[1,]!="|"] - - if(matches[1,"strand"]!=1){ - temp[temp=="A"]="t" - temp[temp=="C"]="g" - temp[temp=="G"]="c" - temp[temp=="T"]="a" - temp[temp=="a"]="A" - temp[temp=="c"]="C" - temp[temp=="g"]="G" - temp[temp=="t"]="T" - temp=temp[,ncol(temp):1] - } - - - ##where came from - - ##bases aligning to reference genome - aligned <- sum(temp[first_row,] %in% c("A","C","G","T")) - - ##perfect, now need to align to positions in the reference - ourrow=first_row - ourpos=rep(0,ncol(temp)) - - if(matches[1,"strand"]==1) { - ourpos[1]=matches[1,"genome_pos"] - } else { - ourpos[1]=matches[1,"genome_pos"]-(aligned-1) - } - - curpos <- ourpos[1] - for(i in 2:length(ourpos)){ - if(temp[ourrow,i] %in% c("A","C","G","T","*")) { - curpos <- curpos + 1 - } - ourpos[i]=curpos - } - ##ourpos gives positions relative to the genome reference sequence - - ##varset will store an initial set of variant positions - ##print("Calling initial variants, sequences analysed:") - varset=matrix(nrow=0,ncol=4) - for(i in 1:nrow(temp)) { - ##if(!i%%50) print(i) - cc=getvars2(temp[ourrow,],temp[i,],ourpos) - if(length(cc)) varset=rbind(varset,cbind(matrix(cc,ncol=3),i)) - } - ##print("...done: filtering variants") - - cccc=paste(varset[,1],rep("W",nrow(varset)),varset[,2],rep("W",nrow(varset)),varset[,3],rep("W",nrow(varset)),sep="") - length(unique(cccc)) - cccc=unique(cccc) - cccc2=cccc - cccc=strsplit(cccc,"W") - newset=matrix(nrow=length(cccc),ncol=3) - for(i in 1:nrow(newset)) newset[i,]=cccc[i][[1]] - newset[1:5,] - dim(newset) - - ##modify to simpler way of calling mutations - varset2=process(newset) - knownvars=varset2 - knownvars=knownvars[order(as.double(knownvars[,1])),] - ##this has removed identical bases - ##$filter variants where can co-occur depending on calling protocol - - - newvars=matrix(nrow=0,ncol=ncol(knownvars)) - all1=knownvars[,2] - all2=knownvars[,3] - for(i in 1:length(all1)) if(nchar(all1[i])1000 | nchar(all2[i])>1000) rm[i]=1 - else{ - - overlaps=which(spos<=spos[i] & epos>=epos[i]) - toto[i]=length(overlaps) - -######first just take subset case, only worry about shorter indels - - if(length(overlaps)>1){ - overlaps=overlaps[overlaps!=i] - for(k in 1:length(overlaps)){ - if(length(grep(all1[i],all1[overlaps[k]])) & length(grep(all2[i],all2[overlaps[k]])) & (nchar(all1[overlaps[k]])-nchar(all1[i])<=25 | nchar(knownvars[i,2])!=nchar(knownvars[i,3]))){ - - cccc=substring(all1[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) - 
cccc2=substring(all2[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) - - if(sum(all1[i]==cccc & all2[i]==cccc2)) rm[i]=1 - - } - if(!length(grep(all1[i],all2[overlaps[k]])) & length(grep(all2[i],all1[overlaps[k]])) & - (nchar(all1[overlaps[k]])-nchar(all1[i])<=25 | nchar(knownvars[i,2])!=nchar(knownvars[i,3]))){ - - cccc=substring(all1[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) - cccc2=substring(all2[overlaps[k]],1:(nchar(all1[overlaps[k]])-nchar(all1[i])+1),nchar(all1[i]):nchar(all1[overlaps[k]])) - - if(sum(all2[i]==cccc & all1[i]==cccc2)) rm[i]=1 - - - } - } - } - ##partial overlap case - overlaps=which(spos[i]=spos & epos[i]spos & spos[i]<=epos & epos[i]>epos) - if(length(overlaps)) rm[i]=1 - - - } - ##print(length(overlaps)) - } - - knownvarsfiltered=knownvars[rm==0,] - ##print("....done:Making variant calls, percentage complete:") - ##this is a final set of variant positions after filtering - - ##use to make calls - tempmat=matrix(nrow=0,ncol=7) - ##resmat now gives the called type at each of the variant positions - resmat=matrix(0,nrow=nrow(temp),ncol=nrow(knownvarsfiltered)) - - for(q in 1:nrow(knownvarsfiltered)){ - ## if(!q%%50) print(q/nrow(knownvarsfiltered)*100) - a1=knownvarsfiltered[q,2] - - a2=knownvarsfiltered[q,3] - pos=as.double(knownvarsfiltered[q,1]) - start=pos - end=pos+nchar(a1)-1 - flag=0 - if(end=start & ourpos<=end) - if(flag==1){ - s1=s1[2:length(s1)] - } - checkmat=temp[,s1] - checkmat[checkmat=="."]="" - checkmat=matrix(checkmat,ncol=length(s1)) - types=rep("",nrow(checkmat)) - for(i in 1:ncol(checkmat)){ - types=paste(types,checkmat[,i],sep="") - } - resmat[types==a2,q]=1 - rs=grep("[*]",types) - if(length(rs)) resmat[rs,q]=-1 - ##print(c(a1,a2)) - ##print(table(types)) - ##print(c(q,sum(resmat[,q]))) - tempmat=rbind(tempmat,c(a1,a2,length(unique(types)),unique(types)[1:2],q,sum(resmat[,q]))) - } - ##print("...done") - ##print("Final clean-up...") - ##can be an issue where some variants are never seen, remove these (it is because there is local similarity and more than one correct local aligment so may be inconsisently called) - - knownvarsfiltered=knownvarsfiltered[colSums(resmat>0)>0,] - resmat=resmat[,colSums(resmat>0)>0] - rownames(resmat)=names(ll) - ##print("...done, saving to file") - save( - resmat, - knownvarsfiltered, - file = file_quilt_hla_snpformatalleles(outputdir, hla_region) - ) - - } - } - ## not ultimately needed but needed to produce other files - ## endnd of code to make hla*snpformatalleles.out - print_message("Done making HLA snp format alleles file") + + + + + + + +per_entry_in_temp_check_match <- function(temp, refseq) { + t(sapply(1:nrow(temp), function(i) { + x <- temp[i, ] != "." 
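+        ## for allele i, keep its non-gap bases and compare them base-by-base against the
+        ## reference genome sequence; return the number of disagreements and the number of
+        ## positions that could actually be compared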
+ a <- temp[i, ][x] + b <- refseq[1:sum(x)] + c <- a != b + c(sum(c, na.rm = TRUE), sum(!is.na(c))) + })) } +determine_reference_genome_hla_allele <- function( + ref_fasta, + chr, + start, + end, + ll, + temp, + hla_strand, + error_check = TRUE +) { + if (1 == 0) { + ref_fasta <- "/data/smew1/rdavies/quilt_hla_2021_12_24_3430//GRCh38_full_analysis_set_plus_decoy_hla.fa" + } + + command <- paste0( + "samtools faidx ", + ref_fasta, " ", + chr, ":", start, "-", end + ) + seq <- system(command, intern = TRUE) + refseq <- unlist(strsplit(seq[-1], "")) + if (hla_strand == -1) { + refseq <- refseq[length(refseq):1] + temp <- temp[, ncol(temp):1] + } + + check_entries <- per_entry_in_temp_check_match(temp, refseq) + + x <- which.min(check_entries[, 1]) + if (sum(check_entries[, 1] == 0) == 0) { + if (error_check) { + f <- stop + } else { + f <- warning + } + f(paste0("Error in automatically determining the HLA allele carried by the reference sequence. No allele is a perfect match to the reference sequence. The nearest match is ", names(ll)[x], " which has ", check_entries[x, 1], " disagreements out of ", check_entries[x, 2], " positions checked")) + } + + names(ll)[which.min(check_entries[, 1])] + +} +get_hla_gene_information <- function( + table_file, + all_hla_regions, + chr, + what = "refseq" +) { + ## table_file <- "hla_ancillary_files/GENCODEv38.hg38.chr6.26000000.34000000.txt.gz" + ## table_file <- "hla_ancillary_files/refseq.hg38.chr6.26000000.34000000.txt.gz" + ## genes <- read.table(gencode_table_file, header = TRUE, comment.char = "") + if (what == "refseq") { + genename <- "name2" + chrom <- "chrom" + start <- "cdsStart" + end <- "cdsEnd" + } else { + genename <- "geneName" + chrom <- "X.chrom" + start <- "thickStart" + end <- "thickEnd" + } + genes <- read.table(table_file, header = TRUE, comment.char = "") + ## + x1 <- match(paste0("HLA-", all_hla_regions), genes[, genename]) + x2 <- match(paste0(all_hla_regions), genes[, genename]) + w <- is.na(x1) & !is.na(x2) + x1[w] <- x2[w] + ## + strand <- c(1, -1)[match(genes[x1[!is.na(x1)], "strand"], c("+", "-"))] + hla_gene_regions_new <- data.frame( + Name = paste0("HLA-", all_hla_regions[!is.na(x1)]), + Chr = genes[x1[!is.na(x1)], chrom], + Start = genes[x1[!is.na(x1)], start] + 1, + End = genes[x1[!is.na(x1)], end], + Strand = strand + ) + hla_gene_regions_new +} + +if (1 == 0) { + ## table_file <- + ## + + get_hla_gene_information( + table_file = "hla_ancillary_files/refseq.hg38.chr6.26000000.34000000.txt.gz", + all_hla_regions, + chr, + what = "refseq" + ) [1:3, ] + + get_hla_gene_information( + table_file = "hla_ancillary_files/GENCODEv38.hg38.chr6.26000000.34000000.txt.gz", + all_hla_regions, + chr, + what = "gen" + ) [1:3, ] + + +} diff --git a/QUILT/R/hla_prepare_phase_functions.R b/QUILT/R/hla_prepare_phase_functions.R index cac9d75..182c604 100644 --- a/QUILT/R/hla_prepare_phase_functions.R +++ b/QUILT/R/hla_prepare_phase_functions.R @@ -1,7 +1,7 @@ phase_hla_haplotypes <- function( outputdir, chr, - hla_gene_region_file, + hla_gene_information, full_regionStart, full_regionEnd, buffer, @@ -138,11 +138,10 @@ phase_hla_haplotypes <- function( ## what actually changes? ## exclude individuals, SNPs? ## for now, just operationalize, then fix later? 
- hla_gene_regions <- read.table(hla_gene_region_file, header = TRUE) out <- parallel::mclapply(regions, mc.cores = nCores, function(region) { - i <- match(paste0("HLA-", region), hla_gene_regions[, "Name"]) - regionStart <- hla_gene_regions[i, "Start"] - regionEnd <- hla_gene_regions[i, "End"] + i <- match(paste0("HLA-", region), hla_gene_information[, "Name"]) + regionStart <- hla_gene_information[i, "Start"] + regionEnd <- hla_gene_information[i, "End"] ## if (region == "A") {regionStart=29942554; regionEnd=29945741} ## if (region == "B") {regionStart=31353367; regionEnd=31357155} ## if (region == "C") {regionStart=31268257; regionEnd=31353367} diff --git a/QUILT/R/quilt-hla-prepare-reference.R b/QUILT/R/quilt-hla-prepare-reference.R index 1c00ff0..dd56fdb 100644 --- a/QUILT/R/quilt-hla-prepare-reference.R +++ b/QUILT/R/quilt-hla-prepare-reference.R @@ -1,10 +1,10 @@ #' @title QUILT_HLA_prepare_reference #' @param outputdir What output directory to use #' @param nGen Number of generations since founding or mixing. Note that the algorithm is relatively robust to this. Use nGen = 4 * Ne / K if unsure, where K is number of haplotypes. -#' @param hla_gene_region_file Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene #' @param hla_types_panel Path to file with 1000 Genomes formatted HLA types (see example for format details) #' @param ipd_igmt_alignments_zip_file Path to zip file with alignments from IPD-IGMT (see README and example for more details) -#' @param quilt_hla_supplementary_info_file Path to file with supplementary information about the genes, necessary for proper converstion. File is tab separated with header, with 3 columns. First (allele) is the allele that matches the reference genome. Second (genome_pos) is the position of this allele in the reference genome, finally the strand (strand) (options 1 or -1) +#' @param ref_fasta Path to reference genome fasta +#' @param refseq_table_file Path to file with UCSC refseq gene information (see README and example for more details) #' @param full_regionStart When building HLA full reference panel, start of maximal region spanning all HLA genes. The 1-based position x is kept if regionStart <= x <= regionEnd #' @param full_regionEnd As above, but end of maximal region spanning all HLA genes #' @param buffer Buffer of region to perform imputation over. So imputation is run form regionStart-buffer to regionEnd+buffer, and reported for regionStart to regionEnd, including the bases of regionStart and regionEnd @@ -26,10 +26,10 @@ QUILT_HLA_prepare_reference <- function( outputdir, nGen, - hla_gene_region_file, hla_types_panel, ipd_igmt_alignments_zip_file, - quilt_hla_supplementary_info_file, + ref_fasta, + refseq_table_file, full_regionStart, full_regionEnd, buffer, @@ -47,6 +47,14 @@ QUILT_HLA_prepare_reference <- function( nCores = 1 ) { + + ## DEPRECATED + ## ' @param hla_gene_region_file Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene + ## ' @param quilt_hla_supplementary_info_file Path to file with supplementary information about the genes, necessary for proper converstion. File is tab separated with header, with 3 columns. First (allele) is the allele that matches the reference genome. 
Second (genome_pos) is the position of this allele in the reference genome, finally the strand (strand) (options 1 or -1) + + ## INTEGRATED + ## ref_fasta <- "/data/smew1/rdavies/quilt_hla_2021_12_24_3430//GRCh38_full_analysis_set_plus_decoy_hla.fa" + ## refseq_table_file <- "hla_ancillary_files/refseq.hg38.chr6.26000000.34000000.txt.gz" x <- as.list(environment()) command_line <- paste0( @@ -56,54 +64,37 @@ QUILT_HLA_prepare_reference <- function( ) print_message(paste0("Running ", command_line)) - - ## outputdir = "~/proj/QUILT/hla-data/", - ## hla_zip_file = "~/proj/QUILT/HLA_input_2021_01_04.zip" - system(paste0("rsync -av ", ipd_igmt_alignments_zip_file, " ", outputdir)) system(paste0("cd ", outputdir, " && unzip -u ", basename(ipd_igmt_alignments_zip_file))) - - regions <- hla_regions_to_prepare - - - supplementary_gene_info <- read.table(quilt_hla_supplementary_info_file, header = TRUE) - ## colnames(supplementary_gene_info) <- c("allele", "first_row", "genome_pos", "strand") - supplementary_gene_info[, "gene"] <- sapply(strsplit(supplementary_gene_info[, "allele"], "*", fixed = TRUE), function(x) x[1]) - - - hla_regions <- all_hla_regions - regs <- hla_regions_to_prepare - ourfiles <- hla_regions - - make_and_save_hla_all_alleles_kmers( - outputdir = outputdir, - ourfiles = ourfiles - ) - - make_and_save_hla_snpformatalleles( - outputdir = outputdir, - regs = regs, - supplementary_gene_info = supplementary_gene_info + hla_gene_information <- get_hla_gene_information( + table_file = refseq_table_file, + all_hla_regions = all_hla_regions, + chr = chr, + what = "refseq" ) - make_and_save_hla_full_alleles_filled_in ( + make_and_save_hla_all_alleles_kmers( outputdir = outputdir, - regs = regs, - supplementary_gene_info = supplementary_gene_info + all_hla_regions = all_hla_regions, + hla_gene_information = hla_gene_information ) - make_quilt_hla_full( + make_and_save_hla_files_for_imputation( outputdir = outputdir, - regs = regs + hla_regions_to_prepare = hla_regions_to_prepare, + hla_gene_information = hla_gene_information, + ref_fasta = ref_fasta, + nCores = nCores ) ## this is somewhat distinct in flavour, uses above, but also HRC haplotypes ## could remove steps and put in here conceivably, or wrap up above + regions <- hla_regions_to_prepare phase_hla_haplotypes( outputdir = outputdir, chr = chr, - hla_gene_region_file = hla_gene_region_file, + hla_gene_information = hla_gene_information, full_regionStart = full_regionStart, full_regionEnd = full_regionEnd, buffer = buffer, @@ -121,7 +112,6 @@ QUILT_HLA_prepare_reference <- function( nCores = nCores ) - print_message("Done preparing HLA reference") } diff --git a/QUILT/R/quilt-hla.R b/QUILT/R/quilt-hla.R index 4f784a4..25b6416 100644 --- a/QUILT/R/quilt-hla.R +++ b/QUILT/R/quilt-hla.R @@ -1,8 +1,8 @@ #' @title QUILT_HLA #' @param bamlist Path to file with bam file locations. File is one row per entry, path to bam files. Bam index files should exist in same directory as for each bam, suffixed either .bam.bai or .bai #' @param region HLA region to be analyzed, for example A for HLA-A -#' @param hla_gene_region_file Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene #' @param dict_file Path to dictionary file for reference build +#' @param hla_gene_region_file For reference packages built after QUILT 1.0.2, this is not used. 
For older reference packages, this is needed, and is a path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene #' @param outputdir What output directory to use. Otherwise defaults to current directory #' @param summary_output_file_prefix Prefix for output text summary files #' @param nCores How many cores to use @@ -23,8 +23,8 @@ QUILT_HLA <- function( bamlist, region, - hla_gene_region_file, dict_file, + hla_gene_region_file = NULL, outputdir = "", summary_output_file_prefix = 'quilt.hla.output', nCores = 1, @@ -105,20 +105,9 @@ QUILT_HLA <- function( ## - ## pre-made file, with boundaries + ## Load various pre-made files ## print_message("Load input files") - ## load(file.path(prepared_hla_reference_dir, "hlageneboundaries.out")) - ## regstart <- ourpos[region,1] - ## regend <- ourpos[region,2] - if (!file.exists(hla_gene_region_file)) { - stop(paste0("Cannot find file with HLA gene boundaries (hla_gene_region_file):", hla_gene_region_file)) - } - ourpos2 <- read.table(hla_gene_region_file, header = TRUE) - regstart <- ourpos2[ourpos2[, "Name"] == paste0("HLA-", region), "Start"] - regend <- ourpos2[ourpos2[, "Name"] == paste0("HLA-", region), "End"] - regmid <- (regstart + regend) / 2 - ## ## inputs from sequences @@ -127,10 +116,26 @@ QUILT_HLA <- function( load(file_quilt_hla_all_alleles_kmers(prepared_hla_reference_dir)) load(file_quilt_hla_full_alleles_filled_in(prepared_hla_reference_dir, region)) load(file_quilt_hla_full(prepared_hla_reference_dir, region)) - ## load(file.path(ancillary_file_dir, "HLAallalleleskmers.out")) ## from simon file, now incorporated ##load(file.path(ancillary_file_dir, paste0("HLA", region, "fullallelesfilledin.out"))) ## from simon file, now incorporated ## load(file.path(ancillary_file_dir, paste0("hla", region, "full.out"))) ## from simon file, now incorporated + + if (!exists("hla_gene_information")) { + if (!file.exists(hla_gene_region_file)) { + stop(paste0("An older reference package is being used, so you need to supply the file with gene boundaries. Cannot find file with HLA gene boundaries (hla_gene_region_file):", hla_gene_region_file)) + } + ourpos2 <- read.table(hla_gene_region_file, header = TRUE) + regstart <- ourpos2[ourpos2[, "Name"] == paste0("HLA-", region), "Start"] + regend <- ourpos2[ourpos2[, "Name"] == paste0("HLA-", region), "End"] + regmid <- (regstart + regend) / 2 + } else { + regstart <- hla_gene_information[hla_gene_information[, "Name"] == paste0("HLA-", region), "Start"] + regend <- hla_gene_information[hla_gene_information[, "Name"] == paste0("HLA-", region), "End"] + regmid <- (regstart + regend) / 2 + } + + + ## diff --git a/QUILT/man/QUILT_HLA.Rd b/QUILT/man/QUILT_HLA.Rd index a6a6492..4c0bb23 100644 --- a/QUILT/man/QUILT_HLA.Rd +++ b/QUILT/man/QUILT_HLA.Rd @@ -7,8 +7,8 @@ QUILT_HLA( bamlist, region, - hla_gene_region_file, dict_file, + hla_gene_region_file = NULL, outputdir = "", summary_output_file_prefix = "quilt.hla.output", nCores = 1, @@ -31,10 +31,10 @@ QUILT_HLA( \item{region}{HLA region to be analyzed, for example A for HLA-A} -\item{hla_gene_region_file}{Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. 
chr6), and 1 based start and end positions of gene} - \item{dict_file}{Path to dictionary file for reference build} +\item{hla_gene_region_file}{For reference packages built after QUILT 1.0.2, this is not used. For older reference packages, this is needed, and is a path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene} + \item{outputdir}{What output directory to use. Otherwise defaults to current directory} \item{summary_output_file_prefix}{Prefix for output text summary files} diff --git a/QUILT/man/QUILT_HLA_prepare_reference.Rd b/QUILT/man/QUILT_HLA_prepare_reference.Rd index 41c7afa..2f7c70e 100644 --- a/QUILT/man/QUILT_HLA_prepare_reference.Rd +++ b/QUILT/man/QUILT_HLA_prepare_reference.Rd @@ -7,10 +7,10 @@ QUILT_HLA_prepare_reference( outputdir, nGen, - hla_gene_region_file, hla_types_panel, ipd_igmt_alignments_zip_file, - quilt_hla_supplementary_info_file, + ref_fasta, + refseq_table_file, full_regionStart, full_regionEnd, buffer, @@ -36,13 +36,13 @@ QUILT_HLA_prepare_reference( \item{nGen}{Number of generations since founding or mixing. Note that the algorithm is relatively robust to this. Use nGen = 4 * Ne / K if unsure, where K is number of haplotypes.} -\item{hla_gene_region_file}{Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene} - \item{hla_types_panel}{Path to file with 1000 Genomes formatted HLA types (see example for format details)} \item{ipd_igmt_alignments_zip_file}{Path to zip file with alignments from IPD-IGMT (see README and example for more details)} -\item{quilt_hla_supplementary_info_file}{Path to file with supplementary information about the genes, necessary for proper converstion. File is tab separated with header, with 3 columns. First (allele) is the allele that matches the reference genome. Second (genome_pos) is the position of this allele in the reference genome, finally the strand (strand) (options 1 or -1)} +\item{ref_fasta}{Path to reference genome fasta} + +\item{refseq_table_file}{Path to file with UCSC refseq gene information (see README and example for more details)} \item{full_regionStart}{When building HLA full reference panel, start of maximal region spanning all HLA genes. The 1-based position x is kept if regionStart <= x <= regionEnd} diff --git a/QUILT_HLA.R b/QUILT_HLA.R index e2814d9..1905613 100755 --- a/QUILT_HLA.R +++ b/QUILT_HLA.R @@ -15,14 +15,15 @@ option_list <- list( help = "HLA region to be analyzed, for example A for HLA-A" ), make_option( - "--hla_gene_region_file", + "--dict_file", type = "character", - help = "Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene" + help = "Path to dictionary file for reference build" ), make_option( - "--dict_file", + "--hla_gene_region_file", type = "character", - help = "Path to dictionary file for reference build" + help = "For reference packages built after QUILT 1.0.2, this is not used. For older reference packages, this is needed, and is a path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. 
chr6), and 1 based start and end positions of gene [default NULL] ", + default = NULL ), make_option( "--outputdir", @@ -115,8 +116,8 @@ Sys.setenv(PATH = paste0(Sys.getenv("PATH"), ":", getwd())) QUILT_HLA( bamlist = opt$bamlist, region = opt$region, - hla_gene_region_file = opt$hla_gene_region_file, dict_file = opt$dict_file, + hla_gene_region_file = opt$hla_gene_region_file, outputdir = opt$outputdir, summary_output_file_prefix = opt$summary_output_file_prefix, nCores = opt$nCores, diff --git a/QUILT_HLA_prepare_reference.R b/QUILT_HLA_prepare_reference.R index 27608fb..6e7ac31 100755 --- a/QUILT_HLA_prepare_reference.R +++ b/QUILT_HLA_prepare_reference.R @@ -14,11 +14,6 @@ option_list <- list( type = "double", help = "Number of generations since founding or mixing. Note that the algorithm is relatively robust to this. Use nGen = 4 * Ne / K if unsure, where K is number of haplotypes." ), - make_option( - "--hla_gene_region_file", - type = "character", - help = "Path to file with gene boundaries. 4 columns, named Name Chr Start End, with respectively gene name (e.g. HLA-A), chromsome (e.g. chr6), and 1 based start and end positions of gene" - ), make_option( "--hla_types_panel", type = "character", @@ -30,9 +25,14 @@ option_list <- list( help = "Path to zip file with alignments from IPD-IGMT (see README and example for more details)" ), make_option( - "--quilt_hla_supplementary_info_file", + "--ref_fasta", + type = "character", + help = "Path to reference genome fasta" + ), + make_option( + "--refseq_table_file", type = "character", - help = "Path to file with supplementary information about the genes, necessary for proper converstion. File is tab separated with header, with 3 columns. First (allele) is the allele that matches the reference genome. Second (genome_pos) is the position of this allele in the reference genome, finally the strand (strand) (options 1 or -1)" + help = "Path to file with UCSC refseq gene information (see README and example for more details)" ), make_option( "--full_regionStart", @@ -124,10 +124,10 @@ Sys.setenv(PATH = paste0(Sys.getenv("PATH"), ":", getwd())) QUILT_HLA_prepare_reference( outputdir = opt$outputdir, nGen = opt$nGen, - hla_gene_region_file = opt$hla_gene_region_file, hla_types_panel = opt$hla_types_panel, ipd_igmt_alignments_zip_file = opt$ipd_igmt_alignments_zip_file, - quilt_hla_supplementary_info_file = opt$quilt_hla_supplementary_info_file, + ref_fasta = opt$ref_fasta, + refseq_table_file = opt$refseq_table_file, full_regionStart = opt$full_regionStart, full_regionEnd = opt$full_regionEnd, buffer = opt$buffer, diff --git a/README.md b/README.md index acceabb..5cb9cd4 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ QUILT ===== -**__Current Version: 1.0.2__** -Release date: Dec 24, 2021 +**__Current Version: 1.0.3__** +Release date: Dec 26, 2021 [![Build Status](https://img.shields.io/travis/rwdavies/QUILT/master.svg)](https://travis-ci.com/rwdavies/QUILT/) Changes in latest version -1. Fix minor but that prevented HLA reference panel from building on new reference data +1. Simplify how QUILT HLA reference packages are built For details of past changes please see [CHANGELOG](CHANGELOG.md). 
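Note on the new gene-boundary lookup: gene coordinates now come from a UCSC refseq table via get_hla_gene_information() (defined above in QUILT/R/hla_prepare_functions.R) and are saved inside the reference package, so QUILT_HLA only needs hla_gene_region_file for packages built with 1.0.2 or earlier. A minimal usage sketch follows, assuming the helper is reached via ::: (it is an internal function) and an illustrative gene list; the table path is the ancillary file referenced elsewhere in this patch.

```
## sketch only: the gene list and QUILT::: access are assumptions, not part of this patch
hla_gene_information <- QUILT:::get_hla_gene_information(
    table_file = "hla_ancillary_files/refseq.hg38.chr6.26000000.34000000.txt.gz",
    all_hla_regions = c("A", "B", "C", "DQA1", "DQB1", "DRB1"),
    chr = "chr6",
    what = "refseq"
)
## expected: one row per matched gene, with columns Name (e.g. "HLA-A"), Chr,
## Start (1-based, cdsStart + 1), End (cdsEnd), and Strand (1 or -1)
head(hla_gene_information)
```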
diff --git a/README_QUILT-HLA.md b/README_QUILT-HLA.md index 8745140..5d1b17b 100644 --- a/README_QUILT-HLA.md +++ b/README_QUILT-HLA.md @@ -122,4 +122,4 @@ http://www.stats.ox.ac.uk/~rdavies/QUILT_HLA_reference_package_2021_12_24.tar ## Preparing a reference package -An example of this is presented in detail in [example/QUILT_hla_reference_panel_construction.Md](example/QUILT_hla_reference_panel_construction.Md), which was used to make the reference panel package from 1000 Genomes Project data presented above. +An example of this is presented in detail in [example/QUILT_hla_reference_panel_construction.Md](example/QUILT_hla_reference_panel_construction.Md), which was used to make the reference panel package from 1000 Genomes Project data presented above. This file can also be run non-interactively, using `bash example/run_example.sh example/QUILT_hla_reference_panel_construction.Md`, and run multiple times using `scripts/hla_prepare_workflow.sh`. diff --git a/example/QUILT_hla_reference_panel_construction.Md b/example/QUILT_hla_reference_panel_construction.Md index 8db2310..c6e840e 100644 --- a/example/QUILT_hla_reference_panel_construction.Md +++ b/example/QUILT_hla_reference_panel_construction.Md @@ -22,11 +22,15 @@ Here we show how we get reference haplotype data, in this case 1000 Genomes proj ``` cd ${inputs_dir} oneKG_vcf_name=CCDG_14151_B01_GRM_WGS_2020-08-05_chr6.filtered.shapeit2-duohmm-phased.vcf.gz -wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/${oneKG_vcf_name} -wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/${oneKG_vcf_name}.tbi -bcftools view --output-file oneKG.temp.vcf.gz --output-type z --min-alleles 2 --max-alleles 2 --types snps ${oneKG_vcf_name} chr6:25000000-34000000 -tabix oneKG.temp.vcf.gz -bcftools convert --haplegendsample oneKG oneKG.temp.vcf.gz +#wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/${oneKG_vcf_name} +#wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_phased/${oneKG_vcf_name}.tbi + +rsync -av /data/smew1/rdavies/quilt_hla_2021_12_24_3430/one* ${inputs_dir} +rsync -av /data/smew1/rdavies/quilt_hla_2021_12_24_3430/*vcf* ${inputs_dir} + +#bcftools view --output-file oneKG.temp.vcf.gz --output-type z --min-alleles 2 --max-alleles 2 --types snps ${oneKG_vcf_name} chr6:25000000-34000000 +#tabix oneKG.temp.vcf.gz +#bcftools convert --haplegendsample oneKG oneKG.temp.vcf.gz reference_haplotype_file=${inputs_dir}oneKG.hap.gz reference_legend_file=${inputs_dir}oneKG.legend.gz reference_sample_file=${inputs_dir}oneKG.samples @@ -40,10 +44,11 @@ Here we download the IPD-IGMT data. 
These links can be determined from the IPD-I ipdigmt_link=https://github.com/ANHIG/IMGTHLA/blob/032815608e6312b595b4aaf9904d5b4c189dd6dc/Alignments_Rel_3390.zip?raw=true cd ${test_dir} -wget ${ipdigmt_link} +#wget ${ipdigmt_link} +rsync -av /data/smew1/rdavies/quilt_hla_2021_12_24_3430/Align* ${inputs_dir} ipdigmt_filename_extra=`basename ${ipdigmt_link}` ipdigmt_filename=`basename ${ipdigmt_link} | sed 's/?raw=true//g'` -mv ${ipdigmt_filename_extra} ${ipdigmt_filename} +#mv ${ipdigmt_filename_extra} ${ipdigmt_filename} ``` Here we download the database of HLA alleles @@ -101,8 +106,9 @@ We need the reference genome to determine which allele is carried by the referen ``` cd ${inputs_dir} -wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa -samtools faidx GRCh38_full_analysis_set_plus_decoy_hla.fa +#wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa +#samtools faidx GRCh38_full_analysis_set_plus_decoy_hla.fa +rsync -av /data/smew1/rdavies/quilt_hla_2021_12_24_3430/GRCh38* ${inputs_dir} ref_fasta=${inputs_dir}GRCh38_full_analysis_set_plus_decoy_hla.fa ``` @@ -147,7 +153,7 @@ wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_refere Here we download some example 1000 Genomes data, using links given in an index file. Since we're actually downloading CRAM files, we'll use the reference genome we downloaded earlier to turn them into bams ``` cd ${inputs_dir} -wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa +#wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa samtools faidx GRCh38_full_analysis_set_plus_decoy_hla.fa ref_fasta=${inputs_dir}GRCh38_full_analysis_set_plus_decoy_hla.fa diff --git a/scripts/hla_prepare_workflow.sh b/scripts/hla_prepare_workflow.sh index f9dd18d..b6947d3 100644 --- a/scripts/hla_prepare_workflow.sh +++ b/scripts/hla_prepare_workflow.sh @@ -1,75 +1,122 @@ set -e -cd ~/proj/QUILT/ +QUILT_DIR=~/proj/QUILT/ +cd "${QUILT_DIR}" +SAVE_DIR="/data/smew1/rdavies/quilt_hla_packages/" + set -e -## -## WITH exclusion of samples -## -output_date=2021_04_08B -script=example/reference_panel_with_exclusion.sh -rm -f ${script} -./example/run_example.sh example/QUILT_hla_reference_panel_construction.Md ${script} -bash ${script} -## Once done, make tar-ball of required outputs -## grab directory specified in above -reference_package_dir=`cat ${script} | grep reference_package_dir= | sed 's/reference_package_dir=//g'` -current_dir=`pwd` -cd ${reference_package_dir} -rm -f quilt.hrc.hla.all.haplotypes.RData -cd .. 
-temp=`basename ${reference_package_dir}`
-temp2=`echo "${temp}/*.RData"`
-tar -cvf QUILT_HLA_reference_package_samples_excluded_${output_date}.tar ${temp2}
-chmod 755 QUILT_HLA_reference_package_samples_excluded_${output_date}.tar
-rsync -av QUILT_HLA_reference_package_samples_excluded_${output_date}.tar ~/pub_html/
-cd ${current_dir}
-
-## also bams
-inputs_dir=`cat ${script} | grep inputs_dir= | sed 's/inputs_dir=//g'`
-current_dir=`pwd`
-cd ${inputs_dir}
-cat bamlist.txt | xargs -l basename > bamlist2.txt
-mv bamlist2.txt bamlist.txt
-tar -cvf QUILT_HLA_example_bams_${output_date}.tar *2.0X*bam* bamlist.txt
-chmod 755 QUILT_HLA_example_bams_${output_date}.tar
-rsync -av QUILT_HLA_example_bams_${output_date}.tar ~/pub_html/
+output_date=2021_12_28
+ipdigmt_version=3.39
+ipdigmt_link=https://github.com/ANHIG/IMGTHLA/blob/032815608e6312b595b4aaf9904d5b4c189dd6dc/Alignments_Rel_3390.zip?raw=true
+ipdigmt_version=3.43
+ipdigmt_link=https://github.com/ANHIG/IMGTHLA/blob/3430/Alignments_Rel_3430.zip?raw=true
 ##
-## WITHOUT exclusion of samples
+## change some things here, potentially
 ##
-output_date=2021_04_09
-script=example/reference_panel_no_exclusion.sh
-rm -f ${script}
-./example/run_example.sh example/QUILT_hla_reference_panel_construction.Md ${script}
-## remove lines about exclusion, replace with empty file
-where=`grep -n "exclude NA12878 and two ASW samples for example usage below" ${script} | cut -f1 --delimiter=":"`
-cat ${script} |
-    awk '{if(NR=='${where}') {print "echo > ${test_dir}exclude_ref_samples_for_testing.txt"} else {print $0}}' |
-    awk '{if(NR=='${where}' + 1) {print ""}else {print $0}}' |
-    awk '{if(NR=='${where}' + 2) {print ""}else {print $0}}' |
-    awk '{if(NR=='${where}' + 3) {print ""}else {print $0}}' > ${script}.temp
-mv ${script}.temp ${script}
-## run here
-bash example/reference_panel_no_exclusion.sh
-
-reference_package_dir=`cat ${script} | grep reference_package_dir= | sed 's/reference_package_dir=//g'`
-current_dir=`pwd`
-cd ${reference_package_dir}
-rm -f quilt.hrc.hla.all.haplotypes.RData
-cd ..
-temp=`basename ${reference_package_dir}`
-temp2=`echo "${temp}/*.RData"`
-tar -cvf QUILT_HLA_reference_package_${output_date}.tar ${temp2}
-chmod 755 QUILT_HLA_reference_package_${output_date}.tar
-rsync -av QUILT_HLA_reference_package_${output_date}.tar ~/pub_html/
-cd ${current_dir}
-
+for i in $(seq 2 3)
+do
+
+    echo ========================= i=${i} ========================
+
+    cd ${QUILT_DIR}
+    script=example/reference_panel_builder.sh
+    rm -f ${script}
+    ./example/run_example.sh example/QUILT_hla_reference_panel_construction.Md ${script}
+
+    if [ ${i} == 1 ]
+    then
+        ##
+        ## WITH exclusion of 3 samples, for demonstration purposes
+        ## the default behaviour of the example script
+        ##
+        exclude_set="demonstration"
+    elif [ ${i} == 2 ]
+    then
+        ##
+        ## WITH exclusion of many samples, for benchmarking
+        ##
+        exclude_set="benchmarking"
+        where=`grep -n "exclude NA12878 and two ASW samples for example usage below" ${script} | cut -f1 --delimiter=":"`
+        cat ${script} |
+            awk '{if(NR=='${where}' + 3) {print ""}else {print $0}}' |
+            awk '{if(NR=='${where}' + 4) {print ""}else {print $0}}' |
+            awk '{if(NR=='${where}' + 5) {print ""}else {print $0}}' > ${script}.temp
+        mv ${script}.temp ${script}
+        ## now replace the file too
+        where=`grep -n "^exclude_sample_list" ${script} | cut -f1 --delimiter=":"`
+        exclude_sample_list="~/proj/QUILT/hla_ancillary_files/hla_samples_to_exclude.txt"
+        what=`echo ${where} i exclude_sample_list=${exclude_sample_list}`
+        awk '{if(FNR=='${where}') {print "exclude_sample_list='${exclude_sample_list}'"} else {print $0}}' ${script} > ${script}.temp
+        mv ${script}.temp ${script}
+    elif [ ${i} == 3 ]
+    then
+        ##
+        ## NO exclusion of samples, for released use
+        ##
+        exclude_set="full"
+        ## same as above, but no insertion of new file, use blank one
+        where=`grep -n "exclude NA12878 and two ASW samples for example usage below" ${script} | cut -f1 --delimiter=":"`
+        cat ${script} |
+            awk '{if(NR=='${where}' + 3) {print ""}else {print $0}}' |
+            awk '{if(NR=='${where}' + 4) {print ""}else {print $0}}' |
+            awk '{if(NR=='${where}' + 5) {print ""}else {print $0}}' > ${script}.temp
+        mv ${script}.temp ${script}
+    fi
+
+    ## change output date
+    where=`grep -n "^output_date" ${script} | cut -f1 --delimiter=":"`
+    what=`echo ${where} i output_date=${output_date}`
+    awk '{if(FNR=='${where}') {print "output_date='${output_date}'"} else {print $0}}' ${script} > ${script}.temp
+    mv ${script}.temp ${script}
+    ## change download link
+    where=`grep -n "^ipdigmt_link" ${script} | cut -f1 --delimiter=":"`
+    what=`echo ${where} i ipdigmt_link=${ipdigmt_link}`
+    awk '{if(FNR=='${where}') {print "ipdigmt_link='${ipdigmt_link}'"} else {print $0}}' ${script} > ${script}.temp
+    mv ${script}.temp ${script}
+
+    ## run!
+    source ${script} ## AM HERE - DID THIS WORK? FINISH OFF THE REST!
+    description=${exclude_set}_${ipdigmt_version}
+
+    ## Once done, make tar-ball of required outputs
+    ## grab the directory specified above
+    cd `dirname ${reference_package_dir}`
+    temp=`basename ${reference_package_dir}`
+    temp2=`echo "${temp}/*.RData"`
+    tar -cvf QUILT_HLA_reference_package_${description}_${output_date}.tar ${temp2}
+    chmod 755 QUILT_HLA_reference_package_${description}_${output_date}.tar
+    mkdir -p /data/smew1/rdavies/quilt_hla_packages/
+    rsync -av QUILT_HLA_reference_package_${description}_${output_date}.tar ${SAVE_DIR}
+    cd "${QUILT_DIR}"
+
+    if [ ${i} == 1 ]
+    then
+        ## also bams
+        ## inputs_dir=`cat ${script} | grep inputs_dir= | sed 's/inputs_dir=//g'`
+        cd ${inputs_dir}
+        cat bamlist.txt | xargs -l basename > bamlist2.txt
+        mv bamlist2.txt bamlist.txt
+        tar -cvf QUILT_HLA_example_bams_${output_date}.tar *2.0X*bam* bamlist.txt
+        chmod 755 QUILT_HLA_example_bams_${output_date}.tar
+        rsync -av QUILT_HLA_example_bams_${output_date}.tar ${SAVE_DIR}
+        cd "${QUILT_DIR}"
+    fi
+
+    ## nuke directory that was being used
+    rm -f ${script}
+    rm -r -f "${inputs_dir}"
+
+
+done
+
+
diff --git a/scripts/test-cli.R b/scripts/test-cli.R
index 63f4fcb..7d4dafd 100755
--- a/scripts/test-cli.R
+++ b/scripts/test-cli.R
@@ -76,7 +76,7 @@ cli_output_file <- "QUILT_HLA_prepare_reference.R"
 STITCH::make_STITCH_cli(
     function_file = "QUILT/R/quilt-hla-prepare-reference.R",
     cli_output_file = cli_output_file,
-    other_character_params = c("outputdir", "ipd_igmt_alignments_zip_file", "quilt_hla_supplementary_info_file", "all_hla_regions", "hla_regions_to_prepare", "reference_exclude_samplelist_file", "region_exclude_file", "hla_gene_region_file", "hla_types_panel"),
+    other_character_params = c("outputdir", "ipd_igmt_alignments_zip_file", "quilt_hla_supplementary_info_file", "all_hla_regions", "hla_regions_to_prepare", "reference_exclude_samplelist_file", "region_exclude_file", "hla_gene_region_file", "hla_types_panel", "refseq_table_file", "ref_fasta"),
     other_double_params = c("full_regionStart", "full_regionEnd", "full_buffer"),
     other_logical_params = c("reference_exclude_samples_for_initial_phasing"),
     character_vectors = c("all_hla_regions", "hla_regions_to_prepare"),
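Note: the test-cli.R change above exposes refseq_table_file and ref_fasta as character parameters of the regenerated QUILT_HLA_prepare_reference.R CLI. As a rough sketch only (not from this patch): assuming the generated CLI follows the usual STITCH --parameter=value convention, an invocation using the newly exposed parameters might look like the following, where the file paths are placeholders and the remaining required arguments (reference haplotypes, regions to prepare, etc.) are omitted.

## hypothetical usage sketch; paths are placeholders, other required arguments not shown
./QUILT_HLA_prepare_reference.R \
    --outputdir=quilt_hla_reference_panel_files/ \
    --ipd_igmt_alignments_zip_file=Alignments_Rel_3430.zip \
    --ref_fasta=GRCh38_full_analysis_set_plus_decoy_hla.fa \
    --refseq_table_file=refseq_table.txt.gz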
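Note: scripts/hla_prepare_workflow.sh above edits single lines of the generated example script in place using a grep/awk pattern (find the line number of an assignment, then rewrite exactly that line). A minimal standalone sketch of that idiom follows; the file name demo.sh and its contents are illustrative only, not part of the patch.

## create a small illustrative script to edit
cat > demo.sh <<'EOF'
set -e
output_date=placeholder
echo "building package for ${output_date}"
EOF

output_date=2021_12_28
## find the line number of the assignment, then overwrite that single line
where=`grep -n "^output_date" demo.sh | cut -f1 --delimiter=":"`
awk '{if(FNR=='${where}') {print "output_date='${output_date}'"} else {print $0}}' demo.sh > demo.sh.temp
mv demo.sh.temp demo.sh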