Skip to content

Commit

Permalink
Add option to estimate TPMs based on different transcript/gene length…
Browse files Browse the repository at this point in the history
…s per sample. Changed tests from genes to transcripts. Add tests for the zpca functions. Update documentation. (#7)
  • Loading branch information
fgypas authored Oct 7, 2020
1 parent 44f34ee commit 782dd65
Show file tree
Hide file tree
Showing 11 changed files with 533,024 additions and 67,956 deletions.
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,21 @@ Perform PCA based on an expression matrix (rows are genes/transcripts, columns a
optional arguments:
-h, --help show this help message and exit
--counts FILE Counts table (tsv). The first column should contain the gene/transcript id. The other columns should contain the counts for each sample.
--lengths FILE Table of feature lengths (tsv). The first column should contain the gene/transcript id. The second column should contain the corresponding lengths
--lengths FILE Table of feature lengths (tsv).
The file can have two types of formats.
First option: The first column should contain the gene/transcript id.
The second column should contain the corresponding lengths
Second option: The first column should contain the gene/transcript id.
The rest of the columns should contain the gene/transcript lengths for each of the samples
Note that the sample names should be the same as the sample names of the counts.
--pseudocount PSEUDOCOUNT
Pseudocount to add in the count table. Default: 1
--tpm-filter TPM_FILTER
Filter genes/transcripts with mean expression less than the provided filter. Default: 0
--filter-not-expressed
Filter not expressed genes/transcripts (0 counts for all samples).
--out DIRECTORY Output directory
-v, --verbose Verbose
```

# Docker
Expand Down
21 changes: 17 additions & 4 deletions scripts/zpca
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from argparse import ArgumentParser, RawTextHelpFormatter

from zpca.zpca import counts2tpm
from zpca.zpca import counts2tpm, counts2tpm_many

def main():
""" Main function """
Expand All @@ -35,7 +35,14 @@ def main():
parser.add_argument(
"--lengths",
dest="lengths",
help="Table of feature lengths (tsv). The first column should contain the gene/transcript id. The second column should contain the corresponding lengths",
help="Table of feature lengths (tsv). " + os.linesep +
"The file can have two types of formats." + os.linesep +
"First option: The first column should contain the gene/transcript id." + os.linesep +
"The second column should contain the corresponding lengths" + os.linesep +
"Second option: The first column should contain the gene/transcript id." + os.linesep +
"The rest of the columns should contain the gene/transcript lengths for each of the samples" + os.linesep +
"Note that the sample names should be the same the sample names of the counts."
,
required=True,
metavar="FILE"
)
Expand Down Expand Up @@ -106,6 +113,10 @@ def main():
sys.stdout.write(f"Reading: {options.lengths} {os.linesep}")
lengths = pd.read_csv(options.lengths, header=0, index_col=0, sep="\t")

mode_lengths = "one"
if int(lengths.shape[1])>2:
mode_lengths = "many"

if options.filter_not_expressed:
if options.verbose:
sys.stdout.write(f"Filtering not expressed genes/transcripts {os.linesep}")
Expand All @@ -117,7 +128,10 @@ def main():

if options.verbose:
sys.stdout.write(f"Converting the data to TPM {os.linesep}")
tpm = counts2tpm(df, lengths)
if mode_lengths == "one":
tpm = counts2tpm(df, lengths)
elif mode_lengths == "many":
tpm = counts2tpm_many(df, lengths)

if options.verbose:
sys.stdout.write(f"Log2 transform the data {os.linesep}")
Expand Down Expand Up @@ -263,7 +277,6 @@ def main():
# Call the Main function and catch Keyboard interrupts
# -----------------------------------------------------------------------------


if __name__ == '__main__':
try:
main()
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='zpca',
version='0.3',
version='0.4',
description="PCA analysis for genes or transcripts.",
author="Foivos Gypas",
author_email='[email protected]',
Expand Down
33,975 changes: 0 additions & 33,975 deletions test_data/gene_expression.tsv

This file was deleted.

Loading

0 comments on commit 782dd65

Please sign in to comment.