Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests/Versioning #6

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/bed_validation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: BED File Validation

# Triggers: pull requests targeting main, and pushes to main.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main

jobs:
  # Single job, "validate", which runs the BED checking script.
  validate:
    # Runner image used for the job.
    runs-on: ubuntu-latest

    steps:
      # Check the repository out under $GITHUB_WORKSPACE so the script and
      # the BED files are available to the later steps.
      - uses: actions/checkout@v2

      # Provide a Python 3.x interpreter (adjust the version if required).
      - name: Set up Python 3.x
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      # Execute the BED file validation script from the repository root.
      - name: Run BED File Validation Script
        run: |
          python ./test/check_beds.py
29 changes: 29 additions & 0 deletions .github/workflows/version-tagging.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Automated Version Tagging

on:
  push:
    branches:
      - main  # or any other branch you want to automate this for

# The tagging step pushes a new tag with GITHUB_TOKEN, which requires write
# access to repository contents; declare it explicitly so the workflow also
# works where the default token permission is read-only.
permissions:
  contents: write

jobs:
  version-tagging:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          fetch-depth: 0  # Ensures history is available for tag creation

      - name: Bump version and push tag
        # NOTE(review): the action reference was unreadable in the reviewed
        # copy (mangled by e-mail obfuscation). The action name and the v6.2
        # version below are assumptions - confirm against the repository.
        uses: mathieudutour/github-tag-action@v6.2
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          default_bump: patch  # Default bump type if no keyword is found
          release_branches: main  # Branches where releases are created
          custom_tag:  # Optional, use if you want a custom tag format
        env:
          # Intended commit message patterns for the different bump types.
          # NOTE(review): these variables do not appear among the inputs
          # documented for the tagging action - confirm they have any effect.
          MAJOR_PATTERN: 'MAJOR'
          MINOR_PATTERN: 'MINOR'
          PATCH_PATTERN: 'PATCH'
184 changes: 184 additions & 0 deletions test/check_beds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import sys
import os
import csv

# Amounts that read_beds() adds to / subtracts from the coordinates of each
# 'GENE' record. With both set to 0 the coordinates read from the BED files
# are left unchanged.
GENE_WINDOW_SIZE_3 = 0
GENE_WINDOW_SIZE_5 = 0
# Directory scanned for .bed files when this script is run directly.
dir = "./current"
# Strand setting handed to read_beds(). The value "+-" makes read_beds() work
# out the orientation of each V and J gene from the relative positions of its
# nonamer and heptamer instead of using one fixed orientation.
sense = "+-"
# Value handed to read_beds() as ref_seq.
# NOTE(review): read_beds() indexes ref_seq by chain name and slices the
# result, which suggests it expects a mapping of sequences rather than a file
# path - confirm whether the FASTA file should be loaded first.
ref_seq = "./test/merged_refseq.fasta"

def get_gene_type(label):
    """Classify a gene or allele label as 'V', 'D', 'J' or 'C'.

    Any allele suffix (everything from the first ``*``) is ignored. The
    fourth character of the remaining gene name selects the type:

    * ``'V'`` or ``'J'`` - returned as is;
    * ``'D'`` - returned only when the name contains ``-`` and no ``_``;
    * anything else - ``'C'``, the catch-all type.

    Names shorter than four characters have no type character and are also
    reported as ``'C'`` rather than raising ``IndexError``.

    Args:
        label: Gene or allele name, e.g. ``'IGHV1-2*01'``.

    Returns:
        One of ``'V'``, ``'D'``, ``'J'`` or ``'C'``.
    """
    gene = label.split('*')[0]

    # Too short to carry a type character: treat like any other
    # unrecognised name.
    if len(gene) < 4:
        return 'C'

    type_char = gene[3]
    if type_char in ('J', 'V'):
        return type_char
    if type_char == 'D' and '-' in gene and '_' not in gene:
        return 'D'
    return 'C'

def _missing_elements(row, required):
    """Return the names in *required* that have no record in *row*, in order."""
    return [el for el in required if el not in row]


def _adjacency_problems(gene, row, chain, g_s, g_e):
    """Yield a message for each neighbouring pair in *chain* that does not abut.

    For consecutive names ``a, b`` in *chain*, the ``g_e`` coordinate of
    ``row[a]`` must equal the ``g_s`` coordinate of ``row[b]``.
    """
    for first, second in zip(chain, chain[1:]):
        if row[first][g_e] != row[second][g_s]:
            yield f"maths problem in {gene}: row['{first}'][{g_e}] != row['{second}'][{g_s}]"


def read_beds(sense, dir, ref_seq):
    """Read every ``*.bed`` file in *dir* and sanity-check the gene elements.

    Each file is read as tab-separated ``chain, start, end, gene, sense``
    records; the file name without ``.bed`` is the element type. Records with
    an empty start or end are skipped. When a gene has two records of the same
    element type, the first one read is stored as ``'3_<type>'`` and the
    second as ``'5_<type>'``.

    Checks then depend on the type reported by ``get_gene_type``:

    * V: all of exon_1, intron, exon_2, heptamer, spacer and nonamer must be
      present and contiguous; any failure terminates the program through
      ``sys.exit``. 'V-REGION' and 'L-PART2' records are derived from exon_2.
    * J and D: missing elements and non-contiguous elements are printed, and
      processing continues.

    Args:
        sense: ``'+'``, ``'-'`` or ``'+-'``. With ``'+-'`` the orientation of
            each gene is inferred from the relative start positions of its
            nonamer and heptamer.
        dir: Directory containing the BED files.
        ref_seq: Mapping of chain name to sequence, used to fill each record's
            ``'seq'``. ``None`` or a plain string (for example a file path) is
            treated as "no sequences available" and ``'seq'`` is left empty.

    Returns:
        Nested dict ``beds[chain][gene][element_type] -> record``.

    Raises:
        SystemExit: A V gene is missing an element or is not contiguous.
        ValueError: A start or end field is not an integer.
    """
    # A path string would make `refname in ref_seq` a substring test and the
    # subsequent indexing a TypeError, so only mappings are used for lookups.
    if ref_seq is None or isinstance(ref_seq, str):
        sequences = {}
    else:
        sequences = ref_seq

    beds = {}
    for entry in os.scandir(dir):
        # Match on the suffix so backups such as 'x.bed.bak' are ignored.
        if not (entry.is_file() and entry.name.endswith('.bed')):
            continue
        el_type = entry.name[:-len('.bed')]

        with open(entry.path, 'r', newline='') as fi:
            reader = csv.DictReader(
                fi,
                delimiter='\t',
                fieldnames=['chain', 'start', 'end', 'gene', 'sense'],
            )
            for row in reader:
                if not (row['start'] and row['end']):
                    continue

                refname = row['chain']
                row['el_type'] = el_type
                row['start'] = int(row['start'])
                row['end'] = int(row['end'])
                if refname in sequences:
                    row['seq'] = sequences[refname][row['start']:row['end']]
                else:
                    row['seq'] = ''

                elements = beds.setdefault(refname, {}).setdefault(row['gene'], {})
                if el_type in elements:
                    # Second record of this type for the gene: the earlier
                    # record becomes the 3' one, this record the 5' one.
                    elements['3_' + el_type] = elements.pop(el_type)
                    elements['5_' + el_type] = row
                else:
                    elements[el_type] = row

    # Sanity checks.
    # Orientation used when it is not inferred per gene.
    if sense == '-':
        default_s, default_e = 'start', 'end'
    else:
        default_s, default_e = 'end', 'start'

    v_elements = ['exon_1', 'intron', 'exon_2', 'heptamer', 'spacer', 'nonamer']
    j_elements = ['heptamer', 'spacer', 'nonamer']
    d_elements = ['3_heptamer', '3_spacer', '3_nonamer',
                  '5_heptamer', '5_spacer', '5_nonamer']

    for refname, genes in beds.items():
        for gene, row in genes.items():
            gene_type = get_gene_type(gene)
            # Start every gene from the default so that no orientation leaks
            # over from the previously checked gene.
            g_s, g_e = default_s, default_e

            if gene_type == 'V':
                missing = _missing_elements(row, v_elements)
                if missing:
                    sys.exit(f"element {missing[0]} missing from gene {gene} in file {refname}")

                nonamer_after_heptamer = row['nonamer']['start'] > row['heptamer']['start']
                if sense == '+-':
                    if nonamer_after_heptamer:
                        g_s, g_e = 'end', 'start'
                    else:
                        g_s, g_e = 'start', 'end'

                v_chain = ['nonamer', 'spacer', 'heptamer', 'exon_2', 'intron', 'exon_1']
                problem = next(_adjacency_problems(gene, row, v_chain, g_s, g_e), None)
                if problem is not None:
                    sys.exit(problem)

                if 'GENE' in row:
                    row['GENE']['start'] = row['GENE']['start'] - GENE_WINDOW_SIZE_5
                    row['GENE']['end'] = row['GENE']['end'] + GENE_WINDOW_SIZE_3

                row_sense = sense
                if sense == '+-':
                    row_sense = '+' if nonamer_after_heptamer else '-'

                # Derive V-REGION and L-PART2 by splitting 11 bases off exon_2.
                # NOTE(review): indentation was lost in the reviewed copy; it
                # is assumed the derived coordinates apply only when the
                # record is absent - confirm against the original file.
                exon_2 = row['exon_2']
                if row_sense == '-':
                    if 'V-REGION' not in row:
                        row['V-REGION'] = {
                            'start': exon_2['start'],
                            'end': exon_2['end'] - 11,
                        }
                    if 'L-PART2' not in row:
                        row['L-PART2'] = {
                            'start': exon_2['end'] - 11,
                            'end': exon_2['end'],
                        }
                else:
                    if 'V-REGION' not in row:
                        row['V-REGION'] = {
                            'start': exon_2['start'] + 11,
                            'end': exon_2['end'],
                        }
                    if 'L-PART2' not in row:
                        row['L-PART2'] = {
                            'start': exon_2['start'],
                            'end': exon_2['start'] + 11,
                        }

            elif gene_type == 'J':
                missing = _missing_elements(row, j_elements)
                for el in missing:
                    print(f'element {el} missing from row {row}')

                # Only run the coordinate checks when every element exists;
                # otherwise the lookups below would raise KeyError.
                if not missing:
                    if sense == '+-':
                        if row['nonamer']['start'] < row['heptamer']['start']:
                            g_s, g_e = 'end', 'start'
                        else:
                            g_s, g_e = 'start', 'end'

                    j_chain = ['heptamer', 'spacer', 'nonamer']
                    for problem in _adjacency_problems(gene, row, j_chain, g_s, g_e):
                        print(problem)

                    # A GENE record is optional, as it is for V genes.
                    if 'GENE' in row:
                        row['GENE'][g_e] -= GENE_WINDOW_SIZE_5
                        row['GENE'][g_s] += GENE_WINDOW_SIZE_3

            elif gene_type == 'D':
                missing = _missing_elements(row, d_elements)
                for el in missing:
                    print(f'element {el} missing from row {row}')

                if not missing:
                    if sense == '+-':
                        # Same rule as for V genes, applied to the 3' signal,
                        # whose check runs nonamer -> spacer -> heptamer.
                        if row['3_nonamer']['start'] > row['3_heptamer']['start']:
                            g_s, g_e = 'end', 'start'
                        else:
                            g_s, g_e = 'start', 'end'

                    d_chains = (
                        ['3_nonamer', '3_spacer', '3_heptamer'],
                        ['5_heptamer', '5_spacer', '5_nonamer'],
                    )
                    for d_chain in d_chains:
                        for problem in _adjacency_problems(gene, row, d_chain, g_s, g_e):
                            print(problem)

                    if 'GENE' in row:
                        row['GENE'][g_s] -= GENE_WINDOW_SIZE_5
                        row['GENE'][g_e] += GENE_WINDOW_SIZE_3

    return beds

def write_beds(beds, dir):
    """Write the records in *beds* to one BED file per element type.

    For every element type found in *beds*, ``<dir>/<element_type>.bed`` is
    written as tab-separated ``chain, start, end, gene, sense`` lines. Records
    are grouped by chain, in the order the chains appear in *beds*, and sorted
    by start position within each chain. Fields missing from a record are
    written as empty strings.

    The ``'el_type'`` and ``'seq'`` entries are left out of the output. The
    records in *beds* themselves are not modified.

    Args:
        beds: Nested dict ``beds[chain][gene][element_type] -> record``.
        dir: Directory in which the files are created.

    Raises:
        ValueError: A record carries a key other than the five BED fields,
            ``'el_type'`` and ``'seq'``.
    """
    fieldnames = ['chain', 'start', 'end', 'gene', 'sense']
    internal_keys = ('el_type', 'seq')

    # element type -> chain -> list of records to write
    bed_files = {}
    for refname, genes in beds.items():
        for elements in genes.values():
            for el_type, row in elements.items():
                # Write a filtered copy so the caller's records keep their
                # 'el_type' and 'seq' entries.
                out_row = {key: value for key, value in row.items()
                           if key not in internal_keys}
                bed_files.setdefault(el_type, {}).setdefault(refname, []).append(out_row)

    for el_type, rows_by_chain in bed_files.items():
        with open(os.path.join(dir, f'{el_type}.bed'), 'w', newline='') as fo:
            writer = csv.DictWriter(fo, delimiter='\t', fieldnames=fieldnames)
            for rows in rows_by_chain.values():
                writer.writerows(sorted(rows, key=lambda rec: rec['start']))

if __name__ == "__main__":
    # Script entry point: validate the BED files using the module-level
    # settings. The returned structure is not used here.
    read_beds(sense=sense, dir=dir, ref_seq=ref_seq)
Loading
Loading