Deploying to gh-pages from @ 8117d28 🚀

regulatory-genomics · Oct 22, 2024 · 61acb8e · 61acb8e
commit 61acb8e
Show file tree

Hide file tree

Showing 293 changed files with 86,517 additions and 0 deletions.
diff --git a/.buildinfo b/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: e0eb6db499003519cf5a02c49f3f533e
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.delete_read.doctree b/.doctrees/_autosummary/precellar.SeqSpec.delete_read.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.doctree b/.doctrees/_autosummary/precellar.SeqSpec.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.to_yaml.doctree b/.doctrees/_autosummary/precellar.SeqSpec.to_yaml.doctree
diff --git a/.doctrees/_autosummary/precellar.SeqSpec.update_read.doctree b/.doctrees/_autosummary/precellar.SeqSpec.update_read.doctree
diff --git a/.doctrees/_autosummary/precellar.align.doctree b/.doctrees/_autosummary/precellar.align.doctree
diff --git a/.doctrees/_autosummary/precellar.make_fragment.doctree b/.doctrees/_autosummary/precellar.make_fragment.doctree
diff --git a/.doctrees/_autosummary/precellar.make_genome_index.doctree b/.doctrees/_autosummary/precellar.make_genome_index.doctree
diff --git a/.doctrees/_autosummary/precellar.utils.strip_barcode_from_fastq.doctree b/.doctrees/_autosummary/precellar.utils.strip_barcode_from_fastq.doctree
diff --git a/.doctrees/api.doctree b/.doctrees/api.doctree
diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle
diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree
diff --git a/.doctrees/install.doctree b/.doctrees/install.doctree
diff --git a/.doctrees/nbsphinx/tutorials/generic.ipynb b/.doctrees/nbsphinx/tutorials/generic.ipynb
@@ -0,0 +1,246 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Processing barcoded Fastq files\n",
+    "\n",
+    "You will likely encounter barcoded FASTQ files when working with single-cell ATAC-seq data.\n",
+    "In the early days of single-cell ATAC-seq, cell barcodes were usually added to the read names of the FASTQ files.\n",
+    "This notebook demonstrates how to process these barcoded FASTQ files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import precellar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extracting cell barcodes from read names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "@CCAGCACAAGCCATCCTATCGT:A00953:155:HVCHLDRXX:1:1101:1036:1031 1:N:0:1\n",
+      "ANCTTGGATCATCAGGTTTGTCTGTAGCTGATTTATTTCTTTAAGTTTCCC\n",
+      "+\n",
+      "F#FFFFFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF\n",
+      "@TAACCACTACGAATGACTGACA:A00953:155:HVCHLDRXX:1:1101:1127:1031 1:N:0:1\n",
+      "TNCCAGGACCAGTGACCGTCACCCGCAGTAAGGATCGGGGCGGCTCCGCCA\n",
+      "+\n",
+      "F#:FFFFFFFFF:FFFFF:FF,F,FFFFFFFF,FFF:FFFF:FFFFFF,FF\n",
+      "@CGATATGTAGGGGACTAATTCC:A00953:155:HVCHLDRXX:1:1101:1145:1031 1:N:0:1\n",
+      "GNCGGATCACAAGGTCAGGAGTTCGAGACCTGGCTGGCCAACACGGTGAAA\n",
+      "\n",
+      "gzip: stdout: Broken pipe\n"
+     ]
+    }
+   ],
+   "source": [
+    "!zcat R1.fq.gz | head"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "precellar.utils.strip_barcode_from_fastq(\n",
+    "    'R1.fq.gz',\n",
+    "    'R1_processed.fq.zst',\n",
+    "    out_barcode='I1.fq.zst',\n",
+    "    regex=\"^([ACTG]+):\",\n",
+    "    right_add=1,\n",
+    ")\n",
+    "\n",
+    "precellar.utils.strip_barcode_from_fastq(\n",
+    "    'R2.fq.gz',\n",
+    "    'R2_processed.fq.zst',\n",
+    "    regex=\"^([ACTG]+):\",\n",
+    "    right_add=1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Starting download of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m Downloaded 2643 bytes\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:02Z \u001b[32mINFO \u001b[0m cached_path::cache\u001b[90m]\u001b[0m New version of https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml cached\n"
+     ]
+    }
+   ],
+   "source": [
+    "assay = precellar.SeqSpec(\"https://raw.githubusercontent.com/regulatory-genomics/precellar/refs/heads/main/seqspec_templates/generic_atac.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\n",
+       "└── atac(153-1150)\n",
+       "    ├── atac-illumina_p5(29)\n",
+       "    ├── atac-read1(34) [↓R1(1-98)]\n",
+       "    ├── gDNA(1-1000)\n",
+       "    ├── atac-read2(34) [↑R2(1-98), ↓I1(22)]\n",
+       "    ├── atac-cell_barcode(22)\n",
+       "    └── atac-illumina_p7(24)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "assay"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assay.update_read(\"R1\", fastq=\"R1_processed.fq.zst\")\n",
+    "assay.update_read(\"I1\", fastq=\"I1.fq.zst\")\n",
+    "assay.update_read(\"R2\", fastq=\"R2_processed.fq.zst\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\n",
+       "└── atac(153-1150)\n",
+       "    ├── atac-illumina_p5(29)\n",
+       "    ├── atac-read1(34) [↓R1(51)]\n",
+       "    ├── gDNA(1-1000)\n",
+       "    ├── atac-read2(34) [↑R2(51), ↓I1(22)]\n",
+       "    ├── atac-cell_barcode(22)\n",
+       "    └── atac-illumina_p7(24)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "assay"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Counting barcodes...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Found 2500 barcodes. 100.00% of them have an exact match in whitelist\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[32mINFO \u001b[0m precellar::align\u001b[90m]\u001b[0m Aligning reads...\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R1) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "\u001b[90m[\u001b[0m2024-10-01T15:18:10Z \u001b[33mWARN \u001b[0m seqspec\u001b[90m]\u001b[0m Reads (R2) may contain additional bases downstream of the variable-length region, e.g., adapter sequences.\n",
+      "100%|██████████| 2500/2500 [00:00<00:00, 15545.42it/s]"
+     ]
+    }
+   ],
+   "source": [
+    "qc = precellar.align(\n",
+    "    assay, \"/data/kzhang/GRCh38/hg38.fa.gz\",\n",
+    "    modality=\"atac\",\n",
+    "    output_fragment=\"atac_fragments.tsv.zst\",\n",
+    "    num_threads=32,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'frac_q30_bases_read1': 0.8179764705882353,\n",
+       " 'frac_valid_barcode': 1.0,\n",
+       " 'sequenced_read_pairs': 2500.0,\n",
+       " 'frac_q30_bases_barcode': 1.0,\n",
+       " 'frac_unmapped': 0.07640000000000002,\n",
+       " 'sequenced_reads': 5000.0,\n",
+       " 'frac_fragment_flanking_single_nucleosome': 0.0029791459781529296,\n",
+       " 'frac_confidently_mapped': 0.8524,\n",
+       " 'frac_fragment_in_nucleosome_free_region': 0.010427010923535254,\n",
+       " 'frac_q30_bases_read2': 0.9442745098039216,\n",
+       " 'frac_nonnuclear': 0.0128,\n",
+       " 'frac_duplicates': 0.004940711462450593}"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "qc"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}