release 3.5.0 (#990)

* Fixes for 3.4.2 * disable azure on OSX * Closes #945 * computeMatricOperation dataRange (#951) * Added dataRange to computeMatricOperation to return min,max,median and 10th and 90th percentile. * fixed pep8 * more pep8 fix * fixed test_sieve of azure tests * one more fix for test_sieve * imported pysam to test_readFiltering.py * updated hash values for test_sieve * fixed galaxy computeMatrixOperation.xml * More fixes for galaxy wrapper * fixed a little typo in bamCompare (#967) * save the output matrix of the plotheatmap in a format to be compatible with running plotheatmap on it again (#968) * Plot profile color map (#971) * Now colors can be set by user for any heatmap in plotProfile. A bug in tick index for plotting heatmap is also fixed. * added a small description * pep8 fix * added sortUsingSamples and clusterUsingSamples to the galaxy wrapper * [WIP] added auto to plotheatmap #908 (#982) * added auto to plotheatmap * fixed lint, added warning message, updated the help for zmin, zmax * galaxy test plotPCA * lower down the delat for potPCA galaxy test * fix #969 (#970) * fix #969 * PEP8 * updated changes.txt * fixed a little bug in parseCommon.py which caused flake8 failure. * [WIP] added auto to plotheatmap #908 (#982) * added auto to plotheatmap * fixed lint, added warning message, updated the help for zmin, zmax * galaxy test plotPCA * lower down the delat for potPCA galaxy test * updated version and changes * removed alpha from plotPCA test * removed compare="sim_size" * fixed plotHeatmap.xml by removing size from the the params and adding anitizer for the 2 new params. * upated change.txt * updated the doc html Co-authored-by: Devon Ryan <[email protected]>
deeptools · Aug 21, 2020 · 3bb56cc · 3bb56cc
1 parent 288311e
commit 3bb56cc
Show file tree

Hide file tree

Showing 13 changed files with 153 additions and 42 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,13 @@
+3.5.0
+
+ * Fixed a small issue in computeGCBias (issue #969)
+ * Added dataRange to computeMatrixOperations to return the min, max, median, and 10th and 90th percentiles of the matrix values per sample.
+ * Fixed a small typo in bamCompare. (issue #966)
+ * Save the output matrix of plotHeatmap in a format that is compatible with running plotHeatmap on it again. (issue #953)
+ * Different colors can now be set by user for plotProfile --plotType heatmap (issue #956)
+ * Added the `auto` option to the zMin and zMax of plotHeatmap. (issue #908)
+ * Added `--sortUsingSamples` and `--clusterUsingSamples` to the plotHeatmap galaxy wrapper. (issue #976)
+
 3.4.3
 
  * Changed iteritems() in estimateEscaleFactor to its python3 compatible items().

diff --git a/deeptools/_version.py b/deeptools/_version.py
@@ -2,4 +2,4 @@
 # This file is originally generated from Git information by running 'setup.py
 # version'. Distribution tarballs contain a pre-generated copy of this file.
 
-__version__ = '3.4.3'
+__version__ = '3.5.0'
diff --git a/deeptools/bamCompare.py b/deeptools/bamCompare.py
@@ -254,7 +254,7 @@ def main(args=None):
     if args.normalizeUsing == 'None':
         args.normalizeUsing = None  # For the sake of sanity
     if args.scaleFactorsMethod != 'None' and args.normalizeUsing:
-        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))
+        sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing))
 
     # Get mapping statistics
     bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors)

diff --git a/deeptools/computeGCBias.py b/deeptools/computeGCBias.py
@@ -604,7 +604,7 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N
     plt.setp(bp['whiskers'], color='black', linestyle='dashed')
     plt.setp(bp['fliers'], marker='None')
     # get the whisker that spands the most
-    y_max = max([x.get_data()[1][1] for x in bp['whiskers']])
+    y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']])
     ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05)
     ax1.set_ylabel('Number of reads')
     ax1.set_xlabel('GC fraction')

diff --git a/deeptools/computeMatrixOperations.py b/deeptools/computeMatrixOperations.py
@@ -46,6 +46,9 @@ def parse_arguments():
 or
   computeMatrixOperations sort -h
 
+or
+  computeMatrixOperations dataRange -h
+
 """,
         epilog='example usages:\n'
                'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n'
@@ -126,6 +129,14 @@ def parse_arguments():
         help='Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.',
         usage='Example usage:\n  computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n')
 
+    # dataRange
+    subparsers.add_parser(
+        'dataRange',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        parents=[infoArgs()],
+        help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.',
+        usage='Example usage:\n  computeMatrixOperations dataRange -m input.mat.gz\n\n')
+
     parser.add_argument('--version', action='version',
                         version='%(prog)s {}'.format(__version__))
 
@@ -300,6 +311,22 @@ def printInfo(matrix):
         print("\t{0}".format(sample))
 
 
+def printDataRange(matrix):
+    """
+    Print a tab-separated table (header line first) with the min, max, median, 10th and 90th percentile of the matrix values for each sample label.
+    """
+    print("Samples\tMin\tMax\tMedian\t10th\t90th")
+    for i, sample in enumerate(matrix.matrix.sample_labels):
+        start = matrix.matrix.sample_boundaries[i]  # lower bound of this sample's slice
+        end = matrix.matrix.sample_boundaries[i + 1]  # upper bound (exclusive) of this sample's slice
+        sample_matrix = matrix.matrix.matrix[..., start:end]  # last-axis slice between the two boundaries
+        print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(sample, np.amin(sample_matrix),
+                                                    np.amax(sample_matrix),
+                                                    np.ma.median(sample_matrix),
+                                                    np.percentile(sample_matrix, 10),
+                                                    np.percentile(sample_matrix, 90)))
+
+
 def relabelMatrix(matrix, args):
     """
     Relabel the samples and groups in a matrix
@@ -771,6 +798,8 @@ def main(args=None):
 
     if args.command == 'info':
         printInfo(hm)
+    if args.command == 'dataRange':
+        printDataRange(hm)
     elif args.command == 'subset':
         sIdx = getSampleBounds(args, hm)
         gIdx, gBounds = getGroupBounds(args, hm)

diff --git a/deeptools/correctGCBias.py b/deeptools/correctGCBias.py
@@ -239,10 +239,10 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
     try:
         if debug:
             endTime = time.time()
-            print("{}, processing {} ({:.1f} per sec) ")
-            "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
-                                      i, i / (endTime - startTime),
-                                      chrNameBit, start, end)
+            print("{}, processing {} ({:.1f} per sec) "
+                  "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
+                                            i, i / (endTime - startTime),
+                                            chrNameBit, start, end))
     except NameError:
         pass
 

diff --git a/deeptools/parserCommon.py b/deeptools/parserCommon.py
@@ -358,11 +358,10 @@ def numberOfProcessors(string):
                 "{} is not a valid number of processors".format(string))
 
         except Exception as e:
-            raise argparse.ArgumentTypeError("the value given is not valid. "
+            raise argparse.ArgumentTypeError("the given value {} is not valid. "
                                              "Error message: {}\nThe number of "
                                              "available processors in your "
-                                             "computer is {}.".format(string, e,
-                                                                      availProc))
+                                             "computer is {}.".format(string, e, availProc))
 
         if numberOfProcessors > availProc:
             numberOfProcessors = availProc
@@ -444,7 +443,7 @@ def heatmapperOutputArgs(args=None,
         output.add_argument('--outFileNameMatrix',
                             help='If this option is given, then the matrix '
                             'of values underlying the heatmap will be saved '
-                            'using this name, e.g. MyMatrix.tab.',
+                            'using this name, e.g. MyMatrix.gz.',
                             metavar='FILE',
                             type=writableFile)
 
@@ -614,9 +613,9 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
 
         optional.add_argument('--sortUsingSamples',
                               help='List of sample numbers (order as in matrix), '
-                              'that are used for sorting by --sortUsing, '
-                              'no value uses all samples, '
-                              'example: --sortUsingSamples 1 3',
+                              'which are used by --sortUsing for sorting. '
+                              'If no value is set, it uses all samples. '
+                              'Example: --sortUsingSamples 1 3',
                               type=int, nargs='+')
 
         optional.add_argument('--linesAtTickMarks',
@@ -704,15 +703,17 @@ def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]):
                               default=None,
                               help='Minimum value for the heatmap intensities. Multiple values, separated by '
                                    'spaces can be set for each heatmap. If the number of zMin values is smaller than'
-                                   'the number of heatmaps the values are recycled.',
-                              type=float,
+                                   'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set '
+                                   ' to the first percentile of the matrix values.',
+                              type=str,
                               nargs='+')
         optional.add_argument('--zMax', '-max',
                               default=None,
                               help='Maximum value for the heatmap intensities. Multiple values, separated by '
                                    'spaces can be set for each heatmap. If the number of zMax values is smaller than'
-                                   'the number of heatmaps the values are recycled.',
-                              type=float,
+                                   'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set '
+                                   ' to the 98th percentile of the matrix values.',
+                              type=str,
                               nargs='+')
         optional.add_argument('--heatmapHeight',
                               help='Plot height in cm. The default for the heatmap '

diff --git a/deeptools/plotHeatmap.py b/deeptools/plotHeatmap.py
@@ -412,6 +412,16 @@ def plotMatrix(hm, outFileName,
             zMin = [None]
         else:
             zMin = [zMin]  # convert to list to support multiple entries
+    elif 'auto' in zMin:
+        matrix_flatten = hm.matrix.flatten()
+        auto_min = np.percentile(matrix_flatten, 1.0)
+        if np.isnan(auto_min):
+            auto_min = None
+        new_mins = [float(x) if x != 'auto' else auto_min for x in zMin]
+        zMin = new_mins
+    else:
+        new_mins = [float(x) for x in zMin]
+        zMin = new_mins
 
     if zMax is None:
         if matrix_flatten is None:
@@ -422,6 +432,23 @@ def plotMatrix(hm, outFileName,
             zMax = [None]
         else:
             zMax = [zMax]
+    elif 'auto' in zMax:
+        matrix_flatten = hm.matrix.flatten()
+        auto_max = np.percentile(matrix_flatten, 98.0)
+        if np.isnan(auto_max):
+            auto_max = None
+        new_maxs = [float(x) if x != 'auto' else auto_max for x in zMax]
+        zMax = new_maxs
+    else:
+        new_maxs = [float(x) for x in zMax]
+        zMax = new_maxs
+    if (len(zMin) > 1) & (len(zMax) > 1):
+        for index, value in enumerate(zMax):
+            if value <= zMin[index]:
+                sys.stderr.write("Warnirng: In bigwig {}, the given zmin ({}) is larger than "
+                                 "or equal to the given zmax ({}). Thus, it has been set "
+                                 "to None. \n".format(index + 1, zMin[index], value))
+                zMin[index] = None
 
     if yMin is None:
         yMin = [None]
@@ -833,7 +860,7 @@ def main(args=None):
             hm.matrix.computeSilhouette(args.args.hclust)
 
     if args.outFileNameMatrix:
-        hm.save_matrix_values(args.outFileNameMatrix)
+        hm.save_matrix(args.outFileNameMatrix)
 
     if args.outFileSortedRegions:
         hm.save_BED(args.outFileSortedRegions)

diff --git a/deeptools/plotProfile.py b/deeptools/plotProfile.py
@@ -6,6 +6,7 @@
 
 import argparse
 import numpy as np
+from math import ceil
 import matplotlib
 matplotlib.use('Agg')
 matplotlib.rcParams['pdf.fonttype'] = 42
@@ -458,6 +459,13 @@ def plotly_hexbin(self):
         py.plot(fig, filename=self.out_file_name, auto_open=False)
 
     def plot_heatmap(self):
+        cmap = ['RdYlBu_r']
+        if self.color_list is not None:  # check the length to be equal to the numebr of plots otherwise multiply it!
+            cmap = self.color_list
+        if len(cmap) < self.numplots:
+            all_colors = cmap
+            for i in range(ceil(self.numplots / len(cmap))):
+                cmap.extend(all_colors)
         matrix_flatten = None
         if self.y_min == [None]:
             matrix_flatten = self.hm.matrix.flatten()
@@ -479,7 +487,6 @@ def plot_heatmap(self):
 
         ax_list = []
         # turn off y ticks
-
         for plot in range(self.numplots):
             labels = []
             col = plot % self.plots_per_row
@@ -503,9 +510,10 @@ def plot_heatmap(self):
 
             if self.per_group:
                 title = self.hm.matrix.group_labels[plot]
+                tickIdx = plot % self.hm.matrix.get_num_samples()
             else:
                 title = self.hm.matrix.sample_labels[plot]
-
+                tickIdx = plot
             ax.set_title(title)
             mat = []  # when drawing a heatmap (in contrast to drawing lines)
             for data_idx in range(self.numlines):
@@ -526,13 +534,12 @@ def plot_heatmap(self):
                     label = sub_matrix['group']
                 labels.append(label)
                 mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0))
-
             img = ax.imshow(np.vstack(mat), interpolation='nearest',
-                            cmap='RdYlBu_r', aspect='auto', vmin=localYMin, vmax=localYMax)
+                            cmap=cmap[plot], aspect='auto', vmin=localYMin, vmax=localYMax)
             self.fig.colorbar(img, cax=cax)
 
             totalWidth = np.vstack(mat).shape[1]
-            xticks, xtickslabel = self.getTicks(plot)
+            xticks, xtickslabel = self.getTicks(tickIdx)
             if np.ceil(max(xticks)) != float(totalWidth - 1):
                 tickscale = float(totalWidth) / max(xticks)
                 xticks_use = [x * tickscale for x in xticks]

diff --git a/deeptools/test/test_readFiltering.py b/deeptools/test/test_readFiltering.py
@@ -4,6 +4,8 @@
 import os.path
 from os import unlink
 import hashlib
+import pysam
+
 
 ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/"
 BAMFILE_FILTER = ROOT + "test_filtering.bam"
@@ -72,12 +74,12 @@ def test_sieve():
                 'test_filtering\t5\t193\n']
     assert_equal(resp, expected)
     unlink(outlog)
-    h = hashlib.md5(open(outfile, "rb").read()).hexdigest()
-    assert(h == "977bdab227a4dbfa3fc9f27c23a3e0b7")
+    h = hashlib.md5(pysam.view(outfile).encode('utf-8')).hexdigest()
+    assert(h == "acbc4443fb0387bfd6c412af9d4fc414")
     unlink(outfile)
 
-    h = hashlib.md5(open(outfiltered, "rb").read()).hexdigest()
-    assert(h == "762e79b7a2245ff6b2cea4139a1455de")
+    h1 = hashlib.md5(pysam.view(outfiltered).encode('utf-8')).hexdigest()
+    assert(h1 == "b90befdd5f073f14acb9a38661f301ad")
     unlink(outfiltered)
 
 

diff --git a/galaxy/wrapper/computeMatrixOperations.xml b/galaxy/wrapper/computeMatrixOperations.xml
@@ -71,6 +71,9 @@
                 -m $submodule.matrixFile
                 -R '#echo "' '".join($files)#'
                 -o $outFileName
+            #else if $submodule.command == "dataRange":
+                dataRange
+                -m $submodule.matrixFile
             #end if
 ]]>
     </command>
@@ -85,6 +88,7 @@
                 <option value="rbind">Bind matrices, top to bottom (rbind)</option>
                 <option value="cbind">Bind matrices, left to right (cbind)</option>
                 <option value="sort">Sort by region order in specified files (sort)</option>
+                <option value="dataRange">Returns the min, max, median, 10th and 90th percentile of the matrix values per sample (dataRange)</option>
             </param>
             <when value="info">
                 <param argument="matrixFile" format="deeptools_compute_matrix_archive" type="data"
@@ -151,6 +155,12 @@
                         help="File, in BED format, containing the regions to plot."/>
                 </repeat>
             </when>
+
+            <when value="dataRange">
+                <param argument="matrixFile" format="deeptools_compute_matrix_archive" type="data"
+                       label="Matrix file from the computeMatrix tool" help="">
+                </param>
+            </when>
         </conditional>
     </inputs>
     <outputs>
@@ -205,7 +215,8 @@ What it does
 +----------------+--------------------------------------------------------------------------------------------------------------------------+
 + sort           | Sorts the given file so regions are in the order of occurence in the input BED/GTF file(s).                              |
 +----------------+--------------------------------------------------------------------------------------------------------------------------+
-
++ dataRange      | Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.                                  |
++----------------+--------------------------------------------------------------------------------------------------------------------------+
 
 These operations are useful when you want to run computeMatrix on multiple files (thereby keeping all of the values together) and later exclude regions/samples or add new ones. Another common use would be if you require the output of computeMatrix to be sorted to match the order of regions in the input file.
 

diff --git a/galaxy/wrapper/deepTools_macros.xml b/galaxy/wrapper/deepTools_macros.xml
@@ -1,10 +1,10 @@
 <macros>
 
     <token name="@THREADS@">--numberOfProcessors "\${GALAXY_SLOTS:-4}"</token>
-    <token name="@WRAPPER_VERSION@">3.4.3.0</token>
+    <token name="@WRAPPER_VERSION@">3.5.0.0</token>
     <xml name="requirements">
         <requirements>
-            <requirement type="package" version="3.4.3">deeptools</requirement>
+            <requirement type="package" version="3.5.0">deeptools</requirement>
             <requirement type="package" version="1.9">samtools</requirement>
         </requirements>
         <expand macro="stdio" />