calico · davek44 · Jun 30, 2018 · Jun 30, 2018 · Jul 1, 2018 · Jul 1, 2018
diff --git a/basenji/augmentation.py b/basenji/augmentation.py
@@ -0,0 +1,195 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+#     https://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========================================================================
+
+import pdb
+import tensorflow as tf
+
+from basenji import ops
+
+def shift_sequence(seq, shift_amount, pad_value=0.25):
+  """Slide a batch of sequences along the length axis, padding the gap.
+
+  A positive shift_amount prepends padding and drops trailing positions;
+  any other value appends padding and drops leading positions.
+
+  Args:
+    seq: a [batch_size, sequence_length, sequence_depth] sequence to shift
+    shift_amount: the signed amount to shift (tf.int32 or int)
+    pad_value: value to fill the padding (primitive or scalar tf.Tensor)
+  Returns:
+    the shifted sequence, with its static shape set to that of seq
+  Raises:
+    ValueError: if seq is not rank 3
+  """
+  if seq.shape.ndims != 3:
+    raise ValueError('input sequence should be rank 3')
+  static_shape = seq.shape
+
+  # |shift_amount| positions of filler, shaped like a slice of the input
+  filler = pad_value * tf.ones_like(seq[:, 0:tf.abs(shift_amount), :])
+
+  def _pad_front():
+    # drop the trailing positions, then prepend the filler
+    kept = seq[:, :-shift_amount, :]
+    return tf.concat([filler, kept], axis=1)
+
+  def _pad_back():
+    # drop the leading positions, then append the filler
+    kept = seq[:, -shift_amount:, :]
+    return tf.concat([kept, filler], axis=1)
+
+  shifted = tf.cond(tf.greater(shift_amount, 0), _pad_front, _pad_back)
+
+  # slicing with a tensor bound loses the static length; restore it
+  shifted.set_shape(static_shape)
+  return shifted
+
+def augment_deterministic_set(data_ops, augment_rc=False, augment_shifts=[0]):
+  """Build one deterministic augmentation of data_ops per setting.
+
+  Every shift yields a forward-strand augmentation; when augment_rc is
+  set, each is immediately followed by its reverse complement counterpart.
+
+  Args:
+    data_ops: dict with keys 'sequence,' 'label,' and 'na.'
+    augment_rc: Boolean
+    augment_shifts: List of ints.
+  Returns
+    data_ops_list: list of augmented data_ops dicts, ordered by shift
+  """
+  # strands to emit for each shift: forward only, or forward then reverse
+  strands = (False, True) if augment_rc else (False,)
+
+  return [augment_deterministic(data_ops, rc, shift)
+          for shift in augment_shifts
+          for rc in strands]
+
+
+def augment_deterministic(data_ops, augment_rc=False, augment_shift=0):
+  """Apply a deterministic augmentation, specified by the parameters.
+
+  Args:
+    data_ops: dict with key 'sequence' and, optionally, 'label' and 'na.'
+    augment_rc: Boolean
+    augment_shift: Int
+  Returns
+    data_ops: augmented data, with all existing keys transformed
+              and 'reverse_preds' bool added.
+  """
+
+  # carry over the optional entries; the input dict itself is not modified
+  data_ops_aug = {}
+  if 'label' in data_ops:
+    data_ops_aug['label'] = data_ops['label']
+  if 'na' in data_ops:
+    data_ops_aug['na'] = data_ops['na']
+
+  # a zero shift passes the sequence through without adding shift ops
+  if augment_shift == 0:
+    data_ops_aug['sequence'] = data_ops['sequence']
+  else:
+    shift_amount = tf.constant(augment_shift, shape=(), dtype=tf.int64)
+    data_ops_aug['sequence'] = shift_sequence(data_ops['sequence'], shift_amount)
+
+  # the reverse complement is applied after the shift
+  if augment_rc:
+    data_ops_aug = augment_deterministic_rc(data_ops_aug)
+  else:
+    data_ops_aug['reverse_preds'] = tf.zeros((), dtype=tf.bool)
+
+  return data_ops_aug
+
+
+def augment_deterministic_rc(data_ops):
+  """Apply a deterministic reverse complement augmentation.
+
+  Args:
+    data_ops: dict with key 'sequence' and, optionally, 'label' and 'na.'
+  Returns
+    data_ops_aug: augmented data ops, with 'reverse_preds' set to True
+  """
+  # the transform takes and returns a dict of data ops
+  data_ops_aug = ops.reverse_complement_transform(data_ops)
+  data_ops_aug['reverse_preds'] = tf.ones((), dtype=tf.bool)
+  return data_ops_aug
+
+
+def augment_stochastic_rc(data_ops):
+  """Apply a stochastic reverse complement augmentation.
+
+  Args:
+    data_ops: dict with key 'sequence' and, optionally, 'label' and 'na.'
+  Returns
+    data_ops_aug: augmented data, with the sampled decision stored
+                  under 'reverse_preds'
+  """
+  # scalar bool tensor: True when a uniform draw exceeds 0.5
+  reverse_preds = tf.random_uniform(shape=[]) > 0.5
+
+  # NOTE: both tf.cond branches must return dicts with the same keys;
+  # ops.reverse_complement_transform only returns 'sequence', 'label'
+  # and 'na', so data_ops should not carry other keys here.
+  data_ops_aug = tf.cond(reverse_preds, lambda: ops.reverse_complement_transform(data_ops),
+                                        lambda: data_ops.copy())
+  data_ops_aug['reverse_preds'] = reverse_preds
+  return data_ops_aug
+
+
+def augment_stochastic_shifts(seq, augment_shifts):
+  """Shift the sequence by an offset sampled at random from augment_shifts.
+
+  Args:
+    seq: input sequence of size [batch_size, length, depth]
+    augment_shifts: list of int offsets to sample from
+  Returns:
+    shifted and padded sequence of size [batch_size, length, depth]
+  """
+  # draw a random position in the offsets list, then look up its value
+  num_choices = len(augment_shifts)
+  choice = tf.random_uniform(shape=[], minval=0,
+      maxval=num_choices, dtype=tf.int64)
+  offset = tf.gather(tf.constant(augment_shifts), choice)
+
+  def _apply_shift():
+    return shift_sequence(seq, offset)
+
+  def _keep():
+    return seq
+
+  # a zero offset leaves the sequence untouched
+  return tf.cond(tf.not_equal(offset, 0), _apply_shift, _keep)
+
+
+def augment_stochastic(data_ops, augment_rc=False, augment_shifts=[]):
+  """Apply stochastic augmentations.
+
+  The input dict is left unmodified; a new dict is returned.
+
+  Args:
+    data_ops: dict with key 'sequence' and, optionally, 'label' and 'na.'
+    augment_rc: Boolean for whether to apply reverse complement augmentation.
+    augment_shifts: list of int offsets to sample shift augmentations.
+  Returns:
+    data_ops_aug: augmented data, with 'reverse_preds' bool added
+  """
+  # work on a shallow copy so the caller's dict is never modified in place
+  data_ops_aug = data_ops.copy()
+
+  # an empty shift list disables shift augmentation
+  if augment_shifts:
+    data_ops_aug['sequence'] = augment_stochastic_shifts(
+        data_ops_aug['sequence'], augment_shifts)
+
+  # the reverse complement is applied after the shift
+  if augment_rc:
+    data_ops_aug = augment_stochastic_rc(data_ops_aug)
+  else:
+    data_ops_aug['reverse_preds'] = tf.zeros((), dtype=tf.bool)
+
+  return data_ops_aug
diff --git a/basenji/batcher.py b/basenji/batcher.py
@@ -84,19 +84,19 @@ def next(self, fwdrc=True, shift=0):
 
       # initialize
       Xb = np.zeros(
-          (self.batch_size, self.seq_len, self.seq_depth), dtype='float32')
+          (Nb, self.seq_len, self.seq_depth), dtype='float32')
       if self.Yf is not None:
         if self.Yf.dtype == np.uint8:
           ytype = 'int32'
         else:
           ytype = 'float32'
 
         Yb = np.zeros(
-            (self.batch_size, self.seq_len // self.pool_width,
+            (Nb, self.seq_len // self.pool_width,
              self.num_targets),
             dtype=ytype)
         NAb = np.zeros(
-            (self.batch_size, self.seq_len // self.pool_width), dtype='bool')
+            (Nb, self.seq_len // self.pool_width), dtype='bool')
 
       # copy data
       for i in range(Nb):

diff --git a/basenji/ops.py b/basenji/ops.py
@@ -41,15 +41,34 @@ def adjust_max(start, stop, start_value, stop_value, name=None):
     else:
       return None
 
-def reverse_complement_transform(seq, label, na):
+def reverse_complement_transform(data_ops):
   """Reverse complement of batched onehot seq and corresponding label and na."""
+  # Args:
+  #   data_ops: dict with a rank-3 'sequence' tensor and, optionally,
+  #             'label' and 'na' tensors.
+  # Returns:
+  #   a new dict holding the transformed 'sequence' plus the reversed
+  #   'label' and 'na' when present; other keys of data_ops are not
+  #   carried over.
+  # Raises:
+  #   ValueError: if 'sequence' is not rank 3.
+
+  # initialize reverse complemented data_ops
+  data_ops_rc = {}
+
+  # extract sequence from dict
+  seq = data_ops['sequence']
+
+  # check rank
   rank = seq.shape.ndims
   if rank != 3:
     raise ValueError("input seq must be rank 3.")
 
-  complement = tf.gather(seq, [3, 2, 1, 0], axis=-1)
-  return (tf.reverse(complement, axis=[1]), tf.reverse(label, axis=[1]),
-          tf.reverse(na, axis=[1]))
+  # reverse complement sequence: reorder the last axis as [3, 2, 1, 0]
+  # (presumably channels are A, C, G, T so this swaps each base with its
+  # complement -- TODO confirm), then flip along axis 1
+  seq_rc = tf.gather(seq, [3, 2, 1, 0], axis=-1)
+  seq_rc = tf.reverse(seq_rc, axis=[1])
+  data_ops_rc['sequence'] = seq_rc
+
+  # reverse labels along axis 1, if provided
+  if 'label' in data_ops:
+    data_ops_rc['label'] = tf.reverse(data_ops['label'], axis=[1])
+
+  # reverse NA along axis 1, if provided
+  if 'na' in data_ops:
+    data_ops_rc['na'] = tf.reverse(data_ops['na'], axis=[1])
+
+  return data_ops_rc
 
 
 def reverse_complement(input_seq, lengths=None):