Persona: You are a bioinformatician with no experience running Nextflow workflows. You need to convert your code from a scripting language to an nf-core workflow.

Task: Convert this code to an nf-core workflow.

Details:
- Source Code: The code is written in Bash.
- Target Workflow: The code needs to be converted into an nf-core workflow.
- Input Format: The Nextflow script should accept input files in samplesheet format.

Additional Requirements:
- Ensure that the nf-core workflow adheres to best practices.
- Include the modules and processes necessary to replicate the functionality of the original script.
- Provide clear comments and documentation within the script for ease of understanding and future modification.
- Ensure that the nextflow_schema.json file includes the script name and a description of the workflow.

Desired Output:
- An nf-core-style set of scripts that replicates the functionality above. The necessary files are main.nf, nextflow.config, nextflow_schema.json, and modules.config; a typical layout is sketched below.
- The script should accept input files in samplesheet format.
- Use a publishDir directive any time a directory must be created.
- The script should be able to run in a Docker, Singularity, or Conda environment.
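A hypothetical minimal layout for those four files, following the nf-core convention of keeping modules.config under conf/:

```
.
├── main.nf
├── nextflow.config
├── nextflow_schema.json
└── conf/
    └── modules.config
```
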
Example of a samplesheet format:
sample,fastq_1,fastq_2
sample1,sample1_R1.fastq.gz,sample1_R2.fastq.gz
sample2,sample2_R1.fastq.gz,sample2_R2.fastq.gz

Example of a main.nf file:
```
#!/usr/bin/env nextflow

nextflow.enable.dsl=2

// Define the workflow
workflow {
    // Read the samplesheet and map each row to a (sample_id, fastq_1, fastq_2) tuple
    Channel.fromPath(params.samplesheet)
        .splitCsv(header: true)
        .map { row -> tuple(row.sample, file(row.fastq_1), file(row.fastq_2)) }
        .set { reads_ch }

    // Step 1: Align reads
    align_ch = ALIGN_READS(reads_ch)

    // Step 2: Sort BAM files
    sort_ch = SORT_BAM(align_ch)

    // Step 3: Index BAM files
    INDEX_BAM(sort_ch)
}

// Process to align reads to a reference genome
process ALIGN_READS {
    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hdfd78af_0' :
        'biocontainers/bwa:0.7.17--hdfd78af_0' }"

    input:
    tuple val(sample_id), path(fastq_1), path(fastq_2)

    output:
    tuple val(sample_id), path("${sample_id}.bam")

    // NOTE: this step also calls samtools, which the bwa container does not ship;
    // in practice use a mulled bwa+samtools image. The reference ref_genome.fa
    // (and its indexes) should be passed in as a process input.
    script:
    """
    bwa mem ref_genome.fa $fastq_1 $fastq_2 > ${sample_id}.sam
    samtools view -Sb ${sample_id}.sam > ${sample_id}.bam
    """
}

// Process to sort BAM files
process SORT_BAM {
    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/samtools:1.9--hdfd78af_0' :
        'biocontainers/samtools:1.9--hdfd78af_0' }"

    input:
    tuple val(sample_id), path(bam)

    output:
    tuple val(sample_id), path("${sample_id}.sorted.bam")

    script:
    """
    samtools sort $bam -o ${sample_id}.sorted.bam
    """
}

// Process to index sorted BAM files
process INDEX_BAM {
    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/samtools:1.9--hdfd78af_0' :
        'biocontainers/samtools:1.9--hdfd78af_0' }"

    input:
    tuple val(sample_id), path(sorted_bam)

    output:
    tuple val(sample_id), path("${sample_id}.sorted.bam.bai")

    script:
    """
    samtools index $sorted_bam
    """
}
```
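
With those files in place, the workflow is launched by pointing it at the samplesheet (parameter names as defined in the examples here):

```
nextflow run main.nf --samplesheet samplesheet.csv --outdir results --containerEngine docker
```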

Example of a nextflow.config file:
```
params {
    samplesheet      = null        // path to the input samplesheet (CSV)
    outdir           = 'results'
    publish_dir_mode = 'copy'      // referenced by modules.config
    containerEngine  = 'docker'    // 'docker', 'singularity' or 'conda'
}

process {
    withName: 'ALIGN_READS' {
        cpus   = 4
        memory = '8 GB'
    }
    withName: 'SORT_BAM' {
        cpus   = 2
        memory = '4 GB'
    }
    withName: 'INDEX_BAM' {
        cpus   = 1
        memory = '2 GB'
    }
}

// Containers are declared per process above, so no global image is set here
docker {
    enabled = params.containerEngine == 'docker'
}

singularity {
    enabled    = params.containerEngine == 'singularity'
    autoMounts = true
}

conda {
    enabled  = params.containerEngine == 'conda'
    cacheDir = './conda'
}
```
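
Note that released nf-core pipelines normally select the runtime with Nextflow profiles rather than a custom params.containerEngine flag. A minimal sketch of that convention, if you prefer it:

```
profiles {
    docker {
        docker.enabled = true
    }
    singularity {
        singularity.enabled    = true
        singularity.autoMounts = true
    }
    conda {
        conda.enabled = true
    }
}
```

Selecting one at launch then looks like `nextflow run main.nf -profile singularity`.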
Example of a nextflow_schema.json file:
```
{
    "title": "nf-core/rnaseq",
    "description": "RNA-seq analysis pipeline",
    "type": "object",
    "properties": {
        "outdir": {
            "type": "string",
            "description": "Output directory",
            "default": "results"
        }
    },
    "required": ["outdir"]
}
```
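
Since main.nf reads its input from params.samplesheet, a real schema would declare that parameter as well; a sketch of the extra property (the format keyword is the one used by nf-core schemas):

```
"samplesheet": {
    "type": "string",
    "format": "file-path",
    "description": "Path to the input samplesheet (CSV)"
}
```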
|
||
Example of a modules.config file: | ||
``` | ||
/* | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
Config file for defining DSL2 per module options and publishing paths | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
Available keys to override module options: | ||
ext.args = Additional arguments appended to command in module. | ||
ext.args2 = Second set of arguments appended to command in module (multi-tool modules). | ||
ext.args3 = Third set of arguments appended to command in module (multi-tool modules). | ||
ext.prefix = File name prefix for output files. | ||
---------------------------------------------------------------------------------------- | ||
*/ | ||
|
||
process { | ||
|
||
publishDir = [ | ||
path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, | ||
mode: params.publish_dir_mode, | ||
saveAs: { filename -> filename.equals('versions.yml') ? null : filename } | ||
] | ||
|
||
} | ||
|
||
module { | ||
bwa { | ||
version = '0.7.17' | ||
source = 'bioconda' | ||
} | ||
samtools { | ||
version = '1.9' | ||
source = 'bioconda' | ||
} | ||
} | ||
``` | ||
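
For modules.config to take effect, it has to be pulled into the main configuration, which nf-core pipelines do with an includeConfig statement:

```
// at the bottom of nextflow.config
includeConfig 'conf/modules.config'
```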
Example of an environment.yml file for Conda environment creation:
```
name: alignment
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - bwa=0.7.17
  - samtools=1.9
```
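
The script to be converted below uses a different toolset, so its per-module environments would pin those tools instead; a hypothetical counterpart (add exact version pins once validated):

```
name: qc_assembly
channels:
  - conda-forge
  - bioconda
dependencies:
  - fastp      # read trimming
  - fastqc     # per-sample QC
  - multiqc    # aggregated QC report
  - unicycler  # assembly
```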

Script to convert:
bacass.sh
```
#!/bin/bash

# Directory containing the input FASTQ files
INPUT_DIR="/home/ramsivakumar/nextflow_conversion/fastq"
OUTPUT_DIR="/home/ramsivakumar/nextflow_conversion/test_out_bash"

# Create output directories for the tools
mkdir -p "$OUTPUT_DIR/fastp_output" "$OUTPUT_DIR/fastqc_output" "$OUTPUT_DIR/unicycler_output"

# Loop over the R1 files in the input directory
for fastq_file_r1 in "$INPUT_DIR"/*_R1_001.fastq.gz; do
    sample_name="$(basename "$fastq_file_r1" _R1_001.fastq.gz)"
    fastq_file_r2="$INPUT_DIR/${sample_name}_R2_001.fastq.gz"

    # Check if the R2 file exists
    if [[ ! -f "$fastq_file_r2" ]]; then
        echo "Warning: Corresponding R2 file for $fastq_file_r1 not found. Skipping this pair."
        continue
    fi

    # Step 1: Run fastp
    fastp_output_r1="$OUTPUT_DIR/fastp_output/${sample_name}_R1.fastp.fastq"
    fastp_output_r2="$OUTPUT_DIR/fastp_output/${sample_name}_R2.fastp.fastq"
    fastp -i "$fastq_file_r1" -I "$fastq_file_r2" -o "$fastp_output_r1" -O "$fastp_output_r2"

    # Step 2: Run fastqc on the fastp output
    fastqc_output_dir="$OUTPUT_DIR/fastqc_output/${sample_name}_fastqc"
    mkdir -p "$fastqc_output_dir"
    fastqc "$fastp_output_r1" "$fastp_output_r2" -o "$fastqc_output_dir"

    # Step 3: Run multiqc
    multiqc_output_dir="$OUTPUT_DIR/multiqc_output"
    mkdir -p "$multiqc_output_dir"
    multiqc "$fastqc_output_dir" -o "$multiqc_output_dir"

    # Step 4: Run unicycler on the fastp output
    unicycler_output_dir="$OUTPUT_DIR/unicycler_output/${sample_name}_unicycler"
    mkdir -p "$unicycler_output_dir"
    unicycler -1 "$fastp_output_r1" -2 "$fastp_output_r2" -o "$unicycler_output_dir"

    echo "Finished processing $sample_name."
done

echo "All files processed."
```
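
For orientation, here is a sketch of how Step 1 (fastp) could be expressed in the DSL2 pattern used in the main.nf example. The container tag is taken from the nf-core fastp module and should be verified before use; the versions.yml block follows nf-core module convention.

```
// Sketch only: the fastp step from bacass.sh as an nf-core style process
process FASTP {
    tag "$sample_id"
    publishDir "${params.outdir}/fastp", mode: 'copy'   // publishDir wherever a directory is created

    conda "${moduleDir}/environment.yml"
    container 'biocontainers/fastp:0.23.4--h5f740d0_0'  // example pin; verify the tag

    input:
    tuple val(sample_id), path(fastq_1), path(fastq_2)

    output:
    tuple val(sample_id), path("${sample_id}_R1.fastp.fastq"), path("${sample_id}_R2.fastp.fastq"), emit: reads
    path "versions.yml", emit: versions

    script:
    """
    fastp \\
        -i $fastq_1 -I $fastq_2 \\
        -o ${sample_id}_R1.fastp.fastq -O ${sample_id}_R2.fastp.fastq

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        fastp: \$(fastp --version 2>&1 | sed 's/^fastp //')
    END_VERSIONS
    """
}
```

FASTQC, MULTIQC and UNICYCLER follow the same shape, with MULTIQC typically run once over all FastQC reports rather than once per sample.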