Add option to estimate TPMs based on different transcript/gene lengths per sample. Changed tests from genes to transcripts. Add tests for the zpca functions. Update documentation. (#7)
zavolanlab · Oct 7, 2020 · 782dd65 · 782dd65
1 parent 44f34ee
commit 782dd65
Show file tree

Hide file tree

Showing 11 changed files with 533,024 additions and 67,956 deletions.
diff --git a/README.md b/README.md
@@ -31,14 +31,21 @@ Perform PCA based on an expression matrix (rows are genes/transcripts, columns a
 optional arguments:
   -h, --help            show this help message and exit
   --counts FILE         Counts table (tsv). The first column should contain the gene/transcript id. The other columns should contain the counts for each sample.
-  --lengths FILE        Table of feature lengths (tsv). The first column should contain the gene/transcript id. The second column should contain the corresponding lengths
+  --lengths FILE        Table of feature lengths (tsv). 
+                        The file can have two types of formats.
+                        First option: The first column should contain the gene/transcript id.
+                        The second column should contain the corresponding lengths
+                        Second option: The first column should contain the gene/transcript id.
+                        The rest of the columns should contain the gene/transcript lengths for each of the samples
+                        Note that the sample names should be the same as the sample names of the counts.
   --pseudocount PSEUDOCOUNT
                         Pseudocount to add in the count table. Default: 1
   --tpm-filter TPM_FILTER
                         Filter genes/transcripts with mean expression less than the provided filter. Default: 0
   --filter-not-expressed
                         Filter not expressed genes/transcripts (0 counts for all samples).
   --out DIRECTORY       Output directory
+  -v, --verbose         Verbose
   ```
 
   # Docker 

diff --git a/scripts/zpca b/scripts/zpca
@@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
 from argparse import ArgumentParser, RawTextHelpFormatter
 
-from zpca.zpca import counts2tpm
+from zpca.zpca import counts2tpm, counts2tpm_many
 
 def main():
     """ Main function """
@@ -35,7 +35,14 @@ def main():
     parser.add_argument(
         "--lengths",
         dest="lengths",
-        help="Table of feature lengths (tsv). The first column should contain the gene/transcript id. The second column should contain the corresponding lengths",
+        help="Table of feature lengths (tsv). " + os.linesep +
+             "The file can have two types of formats." + os.linesep +
+             "First option: The first column should contain the gene/transcript id." + os.linesep +
+             "The second column should contain the corresponding lengths" + os.linesep +
+             "Second option: The first column should contain the gene/transcript id." + os.linesep +
+             "The rest of the columns should contain the gene/transcript lengths for each of the samples" + os.linesep +
+             "Note that the sample names should be the same the sample names of the counts."
+             ,
         required=True,
         metavar="FILE"
     )
@@ -106,6 +113,10 @@ def main():
         sys.stdout.write(f"Reading: {options.lengths} {os.linesep}")
     lengths =  pd.read_csv(options.lengths, header=0, index_col=0, sep="\t")
 
+    mode_lengths = "one"
+    if int(lengths.shape[1])>2:
+        mode_lengths = "many"
+
     if options.filter_not_expressed:
         if options.verbose:
             sys.stdout.write(f"Filtering not expressed genes/transcripts {os.linesep}")
@@ -117,7 +128,10 @@ def main():
 
     if options.verbose:
         sys.stdout.write(f"Converting the data to TPM {os.linesep}")
-    tpm = counts2tpm(df, lengths)
+    if mode_lengths == "one":
+        tpm = counts2tpm(df, lengths)
+    elif mode_lengths == "many":
+        tpm = counts2tpm_many(df, lengths)
 
     if options.verbose:
         sys.stdout.write(f"Log2 transform the data {os.linesep}")
@@ -263,7 +277,6 @@ def main():
 # Call the Main function and catch Keyboard interrupts
 # -----------------------------------------------------------------------------
 
-
 if __name__ == '__main__':
     try:
         main()

diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
 
 setup(
     name='zpca',
-    version='0.3',
+    version='0.4',
     description="PCA analysis for genes or transcripts.",
     author="Foivos Gypas",
     author_email='[email protected]',

diff --git a/test_data/gene_expression.tsv b/test_data/gene_expression.tsv