diff --git a/.github/actions/action.yml b/.github/actions/action.yml deleted file mode 100644 index ac2f87f..0000000 --- a/.github/actions/action.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: 'Test-bioinformatics-transform-BAM' -description: 'This action tests the transformBAM.sh' - -inputs: - file_in: - description: 'Input BAM file' - required: true - default: 'test-data/Sample_PM1_DNA75_1_P_1.md.bam' - dir_out: - description: 'Output directory' - required: true - default: 'test-data/output' - pm_in: - description: 'ID to convert from' - required: true - default: 'PM1' - pm_out: - description: 'Final output ID' - required: true - default: 'TEST1' - -runs: - using: 'docker' - image: 'bioinfo-test' - entrypoint: '/entrypoint.sh' \ No newline at end of file diff --git a/.github/actions/entrypoint.sh b/.github/actions/entrypoint.sh deleted file mode 100755 index 9fb6895..0000000 --- a/.github/actions/entrypoint.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh -echo "Program: $0" -echo "Parameter number: $#" -for var in "$@" -do - echo "Param: $var" -done -echo "From Environmental variables" -echo "File In: $INPUT_FILE_IN" -echo "Dir Out: $INPUT_DIR_OUT" -echo "PM_IN : $INPUT_PM_IN" -echo "PM_OUT : $INPUT_PM_OUT" - -if [ -e /usr/local/bin/transformBAM.sh ];then - echo "Transforming the data" - ls -la /usr/local/bin/transformBAM.sh - transformBAM.sh --file-in "$INPUT_FILE_IN" --dir-out "$INPUT_DIR_OUT" --pm-in "$INPUT_PM_IN" --pm-out "$INPUT_PM_OUT" -fi \ No newline at end of file diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index ce3c0a6..f920518 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -14,7 +14,6 @@ on: pull_request: env: - # TODO: Change variable to your image's name. IMAGE_NAME: bioinformatics jobs: @@ -27,61 +26,137 @@ jobs: if: github.event_name == 'push' steps: - - uses: actions/checkout@v2 - - - name: Build image - run: docker build . 
--file Dockerfile --tag $IMAGE_NAME - - - name: Log into GitHub Container Registry - run: echo "${{ secrets.GHCR_TOKEN }}" | docker login https://ghcr.io -u ${{ secrets.GHCR_SVC_ACCOUNT }} --password-stdin - - - name: Push image to GitHub Container Registry + - uses: actions/checkout@v4 + with: + lfs: true + - name: Checkout LFS objects + run: git lfs checkout + - name: Set Date + run: echo "TODAYS_DATE=$(date +'%Y%m%d')" >> $GITHUB_ENV + + - name: Prepare tags + id: prep run: | - IMAGE_ID=ghcr.io/${{ github.repository_owner }}/$IMAGE_NAME - # Change all uppercase to lowercase - IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') - - # Strip git ref prefix from version - VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,') - - # Strip "v" prefix from tag name - [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//') - - # Use Docker `latest` tag convention - [ "$VERSION" == "main" ] && VERSION=latest - - echo IMAGE_ID=$IMAGE_ID - echo VERSION=$VERSION + # Docker Registries + GHCR_IMAGE_NAME="ghcr.io/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}" + DOCKER_HUB_IMAGE_NAME="${{ github.repository_owner }}/${{ env.IMAGE_NAME }}" + + TAGS="" + FINAL_TAGS="" + + # Identify Tags + # Type of tags: + # 1. Scenario 1 - x.y.z.patchedYYYYMMDD + # 2. Scenario 2 - Semver Tags x.y.z + # 3. Scenario 3 - Main Branch + # 4. Scenario 4 - Branches + + if [[ $GITHUB_REF == refs/tags/* ]]; then + # If this is a tag push, use the tag name (stripping 'refs/tags/') + FULL_TAG=${GITHUB_REF#refs/tags/} + + # Remove leading 'v' from tag if present + if [[ $FULL_TAG == v* ]]; then + FULL_TAG=${FULL_TAG#v} + fi + + # Scenario 1 - x.y.z.patchedYYYYMMDD + # Tags + # 1. x.y.z.patchedYYYYMMDD + # 2. x.y.z + # 3. x.y + # 4. 
x + if [[ $FULL_TAG =~ (.*)\.patched[0-9]+ ]]; then + # Extract the base tag without the .patched segment + BASE_TAG=${BASH_REMATCH[1]} + + if [[ $BASE_TAG =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then + MAJOR=${BASH_REMATCH[1]} + MINOR=${BASH_REMATCH[2]} + PATCH=${BASH_REMATCH[3]} + + TAGS="$FULL_TAG,$BASE_TAG,$MAJOR.$MINOR.$PATCH,$MAJOR.$MINOR,$MAJOR" + fi + else + # Scenario 2 - Semver Tags x.y.z + # Tags + # 1. x.y.z + # 2. x.y + # 3. x + if [[ $FULL_TAG =~ ^([0-9]+)\.([0-9]+)\.([0-9]+)$ ]]; then + MAJOR=${BASH_REMATCH[1]} + MINOR=${BASH_REMATCH[2]} + PATCH=${BASH_REMATCH[3]} + + TAGS="$MAJOR.$MINOR.$PATCH,$MAJOR.$MINOR,$MAJOR" + fi + fi + elif [[ $GITHUB_REF == refs/heads/main ]]; then + # Scenario 3 - Main Branch + # If this is a push to main, use 'latest' + TAGS="latest" + else + # Scenario 4 - Branches + # Otherwise, use the branch name, replacing non-alphanumeric characters with underscores + TAG=$(echo ${GITHUB_REF#refs/heads/} | sed 's/[^a-zA-Z0-9]/_/g') + TAGS=$TAG + fi + + # For each tag, prepend the registry name and output a comma-separated list + for tag in $(echo "$TAGS" | tr ',' '\n'); do + FINAL_TAGS="$FINAL_TAGS $GHCR_IMAGE_NAME:$tag," + FINAL_TAGS="$FINAL_TAGS $DOCKER_HUB_IMAGE_NAME:$tag," + done + + FINAL_TAGS=${FINAL_TAGS%,} + + echo "tags=${FINAL_TAGS}" >> $GITHUB_OUTPUT + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + # list of Docker images to use as base name for tags + images: | + ghcr.io/${{ github.repository_owner }}/${{ env.IMAGE_NAME }} + ${{ github.repository_owner }}/${{ env.IMAGE_NAME }} + labels: + org.opencontainers.image.title=${{ env.IMAGE_NAME }} + org.opencontainers.image.description=Bioinformatics Tools + org.opencontainers.image.vendor=Englander Institute for Precision Medicine + maintainer=Andrea Sboner + org.opencontainers.image.authors=Andrea Sboner + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 - docker tag 
$IMAGE_NAME $IMAGE_ID:$VERSION - docker push $IMAGE_ID:$VERSION + - name: Log into GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ secrets.GHCR_SVC_ACCOUNT }} + password: ${{ secrets.GHCR_TOKEN }} - name: Login to EIPM DockerHub - uses: docker/login-action@v1 + uses: docker/login-action@v3 with: username: ${{ secrets.EIPM_DOCKER_HUB_USERNAME }} password: ${{ secrets.EIPM_DOCKER_HUB_TOKEN }} - - - name: Push image to EIPM Docker Hub - run: | - IMAGE_ID=eipm/$IMAGE_NAME - - # Change all uppercase to lowercase - IMAGE_ID=$(echo $IMAGE_ID | tr '[A-Z]' '[a-z]') - - # Strip git ref prefix from version - VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,') - # Strip "v" prefix from tag name - [[ "${{ github.ref }}" == "refs/tags/"* ]] && VERSION=$(echo $VERSION | sed -e 's/^v//') - - # Use Docker `latest` tag convention - [ "$VERSION" == "main" ] && VERSION=latest - - echo IMAGE_ID=$IMAGE_ID - echo VERSION=$VERSION - - docker tag $IMAGE_NAME $IMAGE_ID:$VERSION - docker push $IMAGE_ID:$VERSION - \ No newline at end of file + - name: Build and push to Docker Hub and GitHub Container Registry + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64 + file: ./Dockerfile + sbom: true + push: true + provenance: mode=max + tags: ${{ steps.prep.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=registry,ref=ghcr.io/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:cache + cache-to: type=registry,ref=ghcr.io/${{ github.repository_owner }}/${{ env.IMAGE_NAME }}:cache,mode=max + \ No newline at end of file diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100755 index b82f013..0000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,27 +0,0 @@ -# Controls when the action will run. 
Triggers the workflow on push or pull request -# events but only for the main branch -on: -# push: -# branches: [ main ] - pull_request: - branches: [ main ] - -jobs: - # This workflow contains a single jobs called "build" and "test" - build: - runs-on: ubuntu-latest - name: 'Build and test the bioinformatics image' - steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - name: Checkout - uses: actions/checkout@v2 - # Builds the image and names it bioinfo-test - - name: Build the image - id: build - run: docker build --rm -t bioinfo-test -f Dockerfile . - # run: docker build --rm -t bioinfo-test -f Dockerfile-test . - # Tests transformBAM.sh script with the test data - - name: Test transformBAM - id: bioinformatics-test - # run: docker run --rm --name bioinfo-test-github bioinfo-test /bin/bash -c "transformBAM.sh --file-in /test-data/Sample_PM1_DNA75_1_P_1_md.bam --dir-out /test-data/output --pm-in PM1 --pm-out TEST1" - uses: ./.github/actions/ diff --git a/.github/workflows/monthly-patching.yml b/.github/workflows/monthly-patching.yml new file mode 100644 index 0000000..f83a7dd --- /dev/null +++ b/.github/workflows/monthly-patching.yml @@ -0,0 +1,48 @@ +name: Monthly Patching + +on: + schedule: + # Runs at 06:30 UTC on the 29th day of every month (months without a 29th are skipped) + - cron: '30 6 29 * *' + +jobs: + tag: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{secrets.GHCR_TOKEN}} + + - name: Set up Git + run: | + git config --global user.name "eipmgithubsvc1" + git config --global user.email "eipmgithubsvc1@gmail.com" + + - name: Get the latest release + id: latest-release + uses: actions/github-script@v7 + with: + script: | + const release = await github.rest.repos.getLatestRelease({ + owner: context.repo.owner, + repo: context.repo.repo + }); + return release.data.tag_name; + + - name: Generate new tag + run: | + ORIGINAL_TAG=${{ steps.latest-release.outputs.result }} + if [[ $ORIGINAL_TAG 
== *".patched"* ]]; then + BASE_TAG=${ORIGINAL_TAG%%.patched*} + else + BASE_TAG=$ORIGINAL_TAG + fi + DATE=$(date +'%Y%m%d') + echo "NEW_TAG=$BASE_TAG.patched$DATE" >> $GITHUB_ENV + + - name: Create and push tag + run: | + git tag $NEW_TAG + git push origin $NEW_TAG diff --git a/Dockerfile b/Dockerfile index 38a6611..ebc793c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,8 +29,8 @@ RUN apt-get update \ #===========================# # Install SAMTOOLS & HTSLIB # #===========================# -ENV SAMTOOLS_VERSION 1.9 -ENV HTSLIB_VERSION 1.9 +ENV SAMTOOLS_VERSION 1.19 +ENV HTSLIB_VERSION 1.19 ENV samtools_dir /${PROGRAMS}/samtools-${SAMTOOLS_VERSION} ENV htslib_dir ${samtools_dir}/htslib-${HTSLIB_VERSION} RUN wget -O samtools-${SAMTOOLS_VERSION}.tar.bz2 https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2 \ @@ -75,8 +75,8 @@ ENV APP_NAME="bioinformatics" \ TZ='US/Eastern' \ PROGRAMS="opt" \ STAR_VERSION='2.7.6a' \ - SAMTOOLS_VERSION='1.9' \ - HTSLIB_VERSION='1.9' + SAMTOOLS_VERSION='1.19' \ + HTSLIB_VERSION='1.19' RUN apt-get update \ && apt-get upgrade -y --fix-missing \ @@ -101,25 +101,3 @@ RUN Rscript /R/scripts/installPackages.R ### Add utilities file COPY combine_pindel_vcfs.sh ${PROGRAMS} -COPY transformBAM.sh /usr/local/bin/ -RUN chmod ugo+x /usr/local/bin/transformBAM.sh - -### Add test data -COPY test-data/ /test-data/ - -### ADD entrypoint data -COPY .github/actions/entrypoint.sh / - -#===========================# -# Security Updates # -#===========================# -# ENV GHOSTSCRIPT_VER 9.52 -# ENV GS_VER 952 -# ENV GHOSTSCRIPT_DIR /${PROGRAMS}/ghostscript-${GHOSTSCRIPT_VER} -# RUN wget -O ghostscript-${GHOSTSCRIPT_VER}.tar.gz https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs952/ghostscript-9.52.tar.gz \ -# && tar -vxzf ghostscript-${GHOSTSCRIPT_VER}.tar.gz -C ${PROGRAMS} \ -# && rm ghostscript-${GHOSTSCRIPT_VER}.tar.gz \ -# && cd ${GHOSTSCRIPT_DIR} \ -# && ./configure \ 
-# && make \ -# && make install diff --git a/Dockerfile-test b/Dockerfile-test deleted file mode 100644 index da9736d..0000000 --- a/Dockerfile-test +++ /dev/null @@ -1,3 +0,0 @@ -FROM alpine -CMD "echo 'Alpine'" -COPY .github/actions/entrypoint.sh / \ No newline at end of file diff --git a/README.md b/README.md index 9b31ebb..1080b30 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This application provides some of the basic bioinformatics tools for development, debugging, and troubleshooting applications. -[![Actions Status](https://github.com/eipm/bioinformatics/workflows/Docker/badge.svg)](https://github.com/eipm/bioinformatics/actions) [![Github](https://img.shields.io/badge/github-1.3.1-green?style=flat&logo=github)](https://github.com/eipm/bioinformatics) [![EIPM Docker Hub](https://img.shields.io/badge/EIPM%20docker%20hub-1.3.1-blue?style=flat&logo=docker)](https://hub.docker.com/repository/docker/eipm/bioinformatics) [![GitHub Container Registry](https://img.shields.io/badge/GitHub%20Container%20Registry-1.3.1-blue?style=flat&logo=docker)](https://github.com/orgs/eipm/packages/container/package/bioinformatics) +[![Actions Status](https://github.com/eipm/bioinformatics/workflows/Docker/badge.svg)](https://github.com/eipm/bioinformatics/actions) [![Github](https://img.shields.io/badge/github-latest-green?style=flat&logo=github)](https://github.com/eipm/bioinformatics) [![EIPM Docker Hub](https://img.shields.io/badge/EIPM%20docker%20hub-latest-blue?style=flat&logo=docker)](https://hub.docker.com/repository/docker/eipm/bioinformatics) [![GitHub Container Registry](https://img.shields.io/badge/GitHub%20Container%20Registry-latest-blue?style=flat&logo=docker)](https://github.com/orgs/eipm/packages/container/package/bioinformatics) It includes: @@ -11,12 +11,10 @@ It includes: * bcftools (installed from distro with apt install: current 1.13 (using htslib 1.13+ds) * vcftools (installed from distro with apt install: current 0.1.16) * bwa (installed from 
distro with apt install: current 0.7.17-r1188) -* samtools (1.9 (using htslib 1.9)) +* samtools (1.19 (using htslib 1.19)) * pindel (latest:0.2.5b9, 20160729) * STAR (2.7.6a) -The installed tools are the latest available as of Dec 5, 2022. - **Note**: *pindel* includes a fix from a non-merged branch (see [Dockerfile](./Dockerfile)). Common R packages are also installed. See `installPackages.R` to see which ones. @@ -47,9 +45,5 @@ docker run --rm -it --name bioinfo -u $(id -u):$(id -g) -v /path/to/local/folder **Tip**: use the `:ro` option to mount read-only folders, e.g. `-v /path/to/local/folder/:/path/to/internal/folder:ro` -### Transform BAM - -This image also include an utility called `transformBAM.sh` that will transform a BAM file by replacing IDs in the header. It will warn if changes need to occur in the content of the BAM file, e.g. the RG group. To use it, simply type `transformBAM.sh` and the usage will be displayed. - ### Combine pindel VCFs -This utility `combine_pindel_vcfs.sh` takes multiple pindel results and merge them [TBDs] \ No newline at end of file +This utility `combine_pindel_vcfs.sh` takes multiple pindel results and merge them [TBDs] diff --git a/checkFileSize.R b/checkFileSize.R deleted file mode 100644 index 69d0407..0000000 --- a/checkFileSize.R +++ /dev/null @@ -1,31 +0,0 @@ -args = commandArgs(trailingOnly=TRUE) -# test if there is are 2 arguments: if not, return an error and the actual usage -if (length(args)!=2) { - stop("Usage: Rscript checkFileSize.R .\nThe json file should list all the info of the bucket: aws s3api list-objects-v2 --bucket --prefix - --endpoint-url > ", call.=FALSE) -} else if (length(args)==2) { - # default output file - print(args[1]) - INPUT_DIR=args[1] - if ( ! dir.exists(INPUT_DIR) ) stop("Directory doesn't exists: ", INPUT_DIR) - JSON_LIST=args[2] - if ( ! 
file.exists(JSON_LIST) ) stop("JSON file doesn't exists: ", JSON_LIST) -} - -library(jsonlite) - -fileSz = fromJSON(JSON_LIST) -fileSz$Contents$fileName = basename(fileSz$Contents$Key) -print("------- START -----") -for ( f in list.files(INPUT_DIR, include.dirs = FALSE, recursive=TRUE, all.files=FALSE, full.name=TRUE ) ) { - # print(sprintf("Disk file: %s", f)) - sz.disk = file.info(f)$size; - sz.aws=fileSz$Contents[ fileSz$Contents$fileName==basename(f),"Size"]; - if( !is.null(sz.aws) ) { - if( sz.aws != sz.disk ) stop("FAIL: Different size: ", f) - print(sprintf("OK: Disk: %.0f -- AWS: %.0f -- file: %s", sz.disk, sz.aws, basename(f))); - } else { - print("###### Couldn't find ", basename(f)) - } -} -print("------- END -----") diff --git a/test-data/Sample_PM1_DNA75_1_P_1.md.bam b/test-data/Sample_PM1_DNA75_1_P_1.md.bam deleted file mode 100644 index bde16cf..0000000 Binary files a/test-data/Sample_PM1_DNA75_1_P_1.md.bam and /dev/null differ diff --git a/test-data/Sample_PM1_DNA75_1_P_1.md.bam.bai b/test-data/Sample_PM1_DNA75_1_P_1.md.bam.bai deleted file mode 100644 index 314bbaa..0000000 Binary files a/test-data/Sample_PM1_DNA75_1_P_1.md.bam.bai and /dev/null differ diff --git a/transformBAM.sh b/transformBAM.sh deleted file mode 100755 index 21d9e0e..0000000 --- a/transformBAM.sh +++ /dev/null @@ -1,143 +0,0 @@ -#!/bin/bash -if [[ "$VERBOSE" = U ]];then - set -x -fi - -PROGRAM=$0 - -usage() { - echo -e "Usage:\tbash $PROGRAM [-i|--file-in] [-o|--dir-out] -p|--pm-in] [-r|--pm-out] [-f|--file-out]" -} -cleanUp() { - if [[ -e BAM.new.header.sam.txt ]];then - rm BAM.new.header.sam.txt - fi - exit $1 -} -logMsg() { - if [[ $# -lt 2 ]];then - printf "\nUsage: logMsg 'message_type' msg\nwhere 'message_type is one of 'INFO', 'WARN', 'DEBUG', or 'ERROR'.\n" - cleanUp -1 - fi - printf "[%s]:\t[%s]\t%s\n" "$(date)" "$1" "$2" - if [[ "$1" == ERROR ]];then - cleanUp 9 - fi -} - -if [[ $# -lt 8 ]];then - usage - cleanUp 127 -fi - -while [ "$1" != "" ]; do - case $1 in - -i | 
--file-in ) shift - FILE_IN=$1 - ;; - -o | --dir-out ) shift - DIR_OUT=$1 - ;; - -p | --pm-in ) shift - PM_IN=$1 - ;; - -r | --pm-out ) shift - PM_OUT=$1 - ;; - -f | --file-out ) shift - FILE_NAME_OUT=$1 - ;; - * ) usage - cleanUp 0 - esac - shift -done - -logMsg "INFO" "--------------------- START Transformation -----------------------" -if [[ ! -e "$FILE_IN" ]];then - logMsg "ERROR" "Input file does not exists: ($FILE_IN)" -fi -DIR_IN="$(pwd)" -logMsg "INFO" "Calculating number of entries in original BAM." -originalSize=$(samtools view -c -@ 8 "$DIR_IN/$FILE_IN") -if [[ ! -e "$DIR_OUT" ]];then - logMsg "WARN" "Output folder does not exist: ($DIR_OUT).\nCreating it now." - mkdir -p $DIR_OUT -fi -logMsg "DEBUG" "Directory Out: ($DIR_OUT)" -[[ -z "$FILE_NAME_OUT" ]] && FILE_OUT=$(basename $FILE_IN | sed "s%$PM_IN%$PM_OUT%g") || FILE_OUT="$FILE_NAME_OUT" -logMsg "DEBUG" "File Name Out: ($FILE_OUT)" - -cd "$DIR_OUT" -logMsg "INFO" "Making sure it's in samtools v10 format" -samtools view -H "$DIR_IN/$FILE_IN" -@ 8 | sed "s%FCID:%FC:%g" | sed "s%BCID:%BC:%g" | sed "s%LNID:%LN:%g" > BAM.v10.header.txt || logMsg "ERROR" "Something wrong with generating the v10 header." -samtools reheader -P BAM.v10.header.txt "$DIR_IN/$FILE_IN" > sample.v10.bam || logMsg "ERROR" "Something wrong with v10 header transformation." -rm BAM.v10.header.txt -logMsg "INFO" "Checking that the v10 version is equivalent to the original BAM" -newSize=$(samtools view -c -@ 8 sample.v10.bam) -if [[ ! $originalSize -eq $newSize ]];then - logMsg "ERROR" "The v10 version is different from the original BAM file." -fi -logMsg "INFO" "Splitting the BAM by @RGs" -samtools split -@ 8 -u sample.v10.noRG.bam:sample.v10.bam -f "sample.v10_%#.%." 
sample.v10.bam || logMsg "ERROR" "Splitting didn't work" -rm sample.v10.bam -if [[ $(samtools view -c -@ 8 sample.v10.noRG.bam ) == 0 ]];then - logMsg "INFO" "All reads have a valid RG" - rm sample.v10.noRG.bam -else - logMsg "WARN" "Some reads don't have an RG" -fi -for currentBAM in sample.v10_*.bam -do - logMsg "DEBUG" "Processing ($currentBAM)" - logMsg "INFO" "Creating new header with the new IDs" - samtools view -H "$currentBAM" -@ 8 | sed "s%$PM_IN%$PM_OUT%g" > currentBAM.new.header.txt || logMsg "ERROR" "Something wrong with generating the sanitized header." - - logMsg "INFO" "Checking new header for old IDs" - if [[ $(grep -c "$PM_IN" currentBAM.new.header.txt) -gt 0 ]];then - logMsg "ERROR" "Found IDs in the new header for $currentBAM" - fi - - logMsg "INFO" "Getting the new RG_ID from the new header" - RG_ID=$(grep "^@RG" currentBAM.new.header.txt | awk '{print $2}' | sed -r 's%^ID:(.*)$%\1%') - - logMsg "DEBUG" "New RG_ID: ($RG_ID)" - - logMsg "INFO" "Assigning the new header" - samtools reheader -P currentBAM.new.header.txt "$currentBAM" > tmp.bam || logMsg "ERROR" "Reheader had some issues" - rm "$currentBAM" # to save space - logMsg "INFO" "Replacing RGs" - samtools addreplacerg -@ 8 -R $RG_ID -o "$currentBAM".rg.bam tmp.bam || logMsg "ERROR" "Something wrong with replacing RG for $RG_ID" - rm tmp.bam currentBAM.new.header.txt - logMsg "INFO" "Checking BAM file for old IDs" - if [[ $(samtools view -@ 8 "$currentBAM".rg.bam | grep -c $PM_IN) -gt 0 ]];then - logMsg "ERROR" "Found old IDs in BAM file. Need to change it. See an example: $(samtools view -@ 8 "$currentBAM".rg.bam | grep $PM_IN | head -n 2)" - else - logMsg "INFO" "No old IDs found in BAM file." 
- fi -done -logMsg "INFO" "Creating the list of BAM files to merge" -logMsg "DEBUG" "Current directory: $(pwd)" -ls -1 sample.v10_*.bam.rg.bam > bam_files_to_merge || logMsg "ERROR" "Cannot create a list of BAM files" -samtools merge -b bam_files_to_merge -c -p -@ 8 $FILE_OUT || logMsg "ERROR" "Merging didn't work" -logMsg "DEBUG" "Removing temporary files" -cat bam_files_to_merge | xargs rm || logMsg "ERROR" "Cannot remove temporary BAM files sample.v10*" -rm bam_files_to_merge|| logMsg "ERROR" "Cannot remove bam_files_to_merge" - -logMsg "INFO" "Checking the final output for PM IDs" -if [[ $(samtools view -H -@ 8 $FILE_OUT | grep -c $PM_IN) -gt 0 ]];then - logMsg "ERROR" "Found old IDs in the header" -fi -if [[ $(samtools view -@ 8 $FILE_OUT | grep -c $PM_IN) -gt 0 ]];then - logMsg "ERROR" "Found old IDs in the BAM file" -fi -logMsg "INFO" "No old IDs found in the header or the BAM file" -logMsg "INFO" "Indexing the file" -samtools index -@ 8 $FILE_OUT || logMsg "ERROR" "Indexing failed $FILE_OUT" -logMsg "DEBUG" "Checking that all reads are included from the original BAM file" -newSize=$(samtools view -c -@ 8 $FILE_OUT) -if [[ ! $originalSize -eq $newSize ]];then - logMsg "ERROR" "The new final version is different from the original BAM file." -fi -logMsg "INFO" "-------- END Transformation ----" -cleanUp 0 \ No newline at end of file