Skip to content

Commit

Permalink
Merge pull request #33 from mdshw5/mutable_FastaRecord
Browse files Browse the repository at this point in the history
Mutable fasta record
  • Loading branch information
mdshw5 committed Nov 20, 2014
2 parents f4b73ed + bc84f78 commit 6160c2d
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 42 deletions.
38 changes: 38 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,40 @@ Or just get a Python string:
>>> genes['NM_001282543.1'][200:230]
CTCGTTCCGCGCCCGCCATGGAACCGGATG
You can also perform line-based iteration, receiving the sequence lines as they appear in the FASTA file:

.. code:: python
>>> from pyfaidx import Fasta
>>> genes = Fasta('tests/data/genes.fasta')
>>> for line in genes['NM_001282543.1']:
... print(line)
CCCCGCCCCTCTGGCGGCCCGCCGTCCCAGACGCGGGAAGAGCTTGGCCGGTTTCGAGTCGCTGGCCTGC
AGCTTCCCTGTGGTTTCCCGAGGCTTCCTTGCTTCCCGCTCTGCGAGGAGCCTTTCATCCGAAGGCGGGA
CGATGCCGGATAATCGGCAGCCGAGGAACCGGCAGCCGAGGATCCGCTCCGGGAACGAGCCTCGTTCCGC
...
.. role:: red

If you want to modify the contents of your FASTA file in-place, you can use the `mutable` argument.
Any portion of the FastaRecord can be replaced with an equivalent-length string.
:red:`Warning`: *This will change the contents of your file immediately and permanently:*

.. code:: python
>>> genes = Fasta('tests/data/genes.fasta', mutable=True)
>>> type(genes['NM_001282543.1'])
<class 'pyfaidx.MutableFastaRecord'>
>>> genes['NM_001282543.1'][:10]
>NM_001282543.1:1-10
CCCCGCCCCT
>>> genes['NM_001282543.1'][:10] = 'NNNNNNNNNN'
>>> genes['NM_001282543.1'][:15]
>NM_001282543.1:1-15
NNNNNNNNNNCTGGC
It also provides a command-line script:

cli script: faidx
Expand Down Expand Up @@ -228,6 +262,10 @@ A lower-level Faidx class is also available:

Changes
-------
*New in version 0.3.0*:

- FastaRecord now works as a line-based iterator (`#30 <https://github.com/mdshw5/pyfaidx/issues/30>`_)
- Added MutableFastaRecord class that allows same-length in-place replacement for FASTA (`#29 <https://github.com/mdshw5/pyfaidx/issues/29>`_)

*New in version 0.2.9*:

Expand Down
134 changes: 96 additions & 38 deletions pyfaidx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from __future__ import division
import sys
import os
import re
import warnings
from six import PY2, PY3, string_types
from six.moves import zip_longest
try:
Expand Down Expand Up @@ -168,7 +170,8 @@ def __str__(self):
class Faidx(object):
""" A python implementation of samtools faidx FASTA indexing """
def __init__(self, filename, default_seq=None, key_function=None,
as_raw=False, strict_bounds=False, read_ahead=None):
as_raw=False, strict_bounds=False, read_ahead=None,
mutable=False):
"""
filename: name of fasta file
key_function: optional callback function which should return a unique
Expand All @@ -178,7 +181,11 @@ def __init__(self, filename, default_seq=None, key_function=None,
Default: False (i.e. return a Sequence() object).
"""
self.filename = filename
self.file = open(filename, 'rb')
if mutable:
self.file = open(filename, 'rb+')
warnings.warn("FASTA mutability is experimental. Use it carefully!", FutureWarning)
else:
self.file = open(filename, 'rb')
self.indexname = filename + '.fai'
self.key_function = key_function if key_function else lambda rname: rname
self.as_raw = as_raw
Expand All @@ -187,6 +194,7 @@ def __init__(self, filename, default_seq=None, key_function=None,
self.index = OrderedDict()
self.buffer = None
self.read_ahead = read_ahead
self.mutable = mutable

if os.path.exists(self.indexname):
self.read_fai()
Expand Down Expand Up @@ -275,7 +283,7 @@ def build_index(self):
clen = line_clen
rlen += line_clen
elif (line[0] == '>') and (rname is not None):
indexfile.write("{rname}\t{rlen:n}\t{thisoffset:n}\t{clen:n}\t{blen:n}\n".format(**locals()))
indexfile.write("{rname}\t{rlen:d}\t{thisoffset:d}\t{clen:d}\t{blen:d}\n".format(**locals()))
blen = 0
rlen = 0
clen = 0
Expand All @@ -284,7 +292,7 @@ def build_index(self):
offset += line_blen
thisoffset = offset
line_number += 1
indexfile.write("{rname}\t{rlen:n}\t{thisoffset:n}\t{clen:n}\t{blen:n}\n".format(**locals()))
indexfile.write("{rname}\t{rlen:d}\t{thisoffset:d}\t{clen:d}\t{blen:d}\n".format(**locals()))

def write_fai(self):
with open(self.indexname, 'w') as outfile:
Expand All @@ -304,50 +312,52 @@ def from_buffer(self, name, start, end):
def fill_buffer(self, name, start, end):
q_len = end - start
if q_len > self.read_ahead:
return self.from_file(name, start, end)
return self.format_seq(name, start, end)
try:
seq = self.from_file(name, start, start + self.read_ahead)
seq = self.format_seq(name, start, start + self.read_ahead)
if not self.as_raw:
self.buffer = seq
elif self.as_raw:
self.buffer = Sequence(name=name, start=int(start),
end=int(end), seq=seq)
return self.from_buffer(name, start, end)
except FetchError:
return self.from_file(name, start, end)
return self.format_seq(name, start, end)

def fetch(self, name, start, end):
if not self.read_ahead:
return self.from_file(name, start, end)
return self.format_seq(name, start, end)
# get sequence from read-ahead buffer if it exists
if self.read_ahead and (name, start, end) in self:
return self.from_buffer(name, start, end)
elif self.read_ahead:
return self.fill_buffer(name, start, end)

def from_file(self, rname, start, end):
def from_file(self, rname, start, end, internals=False):
""" Fetch the sequence ``[start:end]`` from ``rname`` using 1-based coordinates
1. Count newlines before start
2. Count newlines to end
3. Difference of 1 and 2 is number of newlines in [start:end]
4. Seek to start position, taking newlines into account
5. Read to end position, return sequence without newlines
5. Read to end position, return sequence
"""
try:
i = self.index[rname]
except KeyError:
raise FetchError("Requested rname {0} does not exist! "
"Please check your FASTA file.".format(rname))
start = start - 1 # make coordinates [0,1)
seq_len = end - start
start0 = start - 1 # make coordinates [0,1)
if start0 < 0:
raise FetchError("Requested start coordinate must be greater than 1.")
seq_len = end - start0

newlines_total = int(i.rlen / i.lenc * (i.lenb - i.lenc))
newlines_before = int((start - 1) / i.lenc *
(i.lenb - i.lenc)) if start > 0 else 0
newlines_before = int((start0 - 1) / i.lenc *
(i.lenb - i.lenc)) if start0 > 0 else 0
newlines_to_end = int(end / i.lenc * (i.lenb - i.lenc))
newlines_inside = newlines_to_end - newlines_before
seq_blen = newlines_inside + seq_len
bstart = i.offset + newlines_before + start
bstart = i.offset + newlines_before + start0
bend = i.offset + newlines_total + i.rlen
self.file.seek(bstart)

Expand All @@ -357,28 +367,46 @@ def from_file(self, rname, start, end):
raise FetchError("Requested end coordinate {0:n} outside of {1}. "
"\n".format(end, rname))
if seq_blen > 0:
seq = self.file.read(seq_blen).decode().replace('\n', '')
seq = self.file.read(seq_blen).decode()
elif seq_blen <= 0 and not self.strict_bounds:
if self.as_raw:
return ''
else:
return Sequence(name=rname, start=0, end=0)
seq = ''
elif seq_blen <= 0 and self.strict_bounds:
raise FetchError("Requested coordinates start={0:n} end={1:n} are "
"invalid.\n".format(start + 1, end))
"invalid.\n".format(start, end))
if not internals:
return seq
else:
return (seq, locals())

if len(seq) < end - start and self.default_seq: # Pad missing positions with default_seq
pad_len = end - start - len(seq)
def format_seq(self, rname, start, end):
seq = self.from_file(rname, start, end).replace('\n', '')
start0 = start - 1
if len(seq) < end - start0 and self.default_seq: # Pad missing positions with default_seq
pad_len = end - start0 - len(seq)
seq = ''.join([seq, pad_len * self.default_seq])
else: # Return less than requested range
end = start + len(seq)
end = start0 + len(seq)

if self.as_raw:
return seq
else:
return Sequence(name=rname, start=int(start + 1),
return Sequence(name=rname, start=int(start),
end=int(end), seq=seq)

def to_file(self, rname, start, end, seq):
""" Write sequence in region from start-end, overwriting current
contents of the FASTA file. """
if not self.mutable:
raise IOError("Write attempted for immutable Faidx instance. Set mutable=True to modify original FASTA.")
file_seq, internals = self.from_file(rname, start, end, internals=True)
file_newline_pos = [match.start() for match in re.finditer('\n', file_seq)]
seq = list(seq)
if len(seq) != len(file_seq) - len(file_newline_pos):
raise IOError("Specified replacement sequence needs to have the same length as original.")
elif len(seq) == len(file_seq) - len(file_newline_pos):
any(seq.insert(i, '\n') for i in file_newline_pos)
self.file.seek(internals['bstart'])
self.file.write(''.join(seq).encode())

def __enter__(self):
return self
Expand Down Expand Up @@ -439,43 +467,73 @@ def __str__(self):
return str(self[:])


class MutableFastaRecord(FastaRecord):
def __init__(self, name, fa):
super(MutableFastaRecord, self).__init__(name, fa)

def __setitem__(self, n, value):
"""Mutate sequence in region [start, end)
to value.
Coordinates are 0-based, end-exclusive."""
try:
if isinstance(n, slice):
start, stop, step = n.start, n.stop, n.step
if step:
raise IndexError("Step operator is not implemented.")
if not start:
start = 0
if not stop:
stop = len(self)
if stop < 0:
stop = len(self) + stop
if start < 0:
start = len(self) + start
self._fa.faidx.to_file(self.name, start + 1, stop, value)

elif isinstance(n, int):
if n < 0:
n = len(self) + n
return self._fa.faidx.to_file(self.name, n + 1, n + 1, value)
except (FetchError, IOError):
raise


class Fasta(object):
def __init__(self, filename, default_seq=None, key_function=None, as_raw=False, strict_bounds=False, read_ahead=None):
def __init__(self, filename, default_seq=None, key_function=None, as_raw=False, strict_bounds=False, read_ahead=None, mutable=False):
"""
An object that provides a pygr compatible interface.
filename: name of fasta file
default_seq: if given, this base will always be returned if region is unavailable.
key_function: optional callback function which should return a unique key for the self._records dictionary when given rname.
key_function: optional callback function which should return a unique key when given rname.
as_raw: optional parameter to specify whether to return sequences as a Sequence() object or as a raw string. Default: False (i.e. return a Sequence() object).
"""
self.filename = filename
self.mutable = mutable
self.faidx = Faidx(filename, key_function=key_function, as_raw=as_raw,
default_seq=default_seq, strict_bounds=strict_bounds,
read_ahead=read_ahead)
read_ahead=read_ahead, mutable=mutable)

def __contains__(self, rname):
"""Return True if genome contains record."""
return rname in self.keys()

def __getitem__(self, rname):
"""Return a chromosome by its name."""
return FastaRecord(rname, self)
if rname in self:
if not self.mutable:
return FastaRecord(rname, self)
else:
return MutableFastaRecord(rname, self)
else:
raise KeyError("{0} not in {1}.".format(rname, self.filename))

def __repr__(self):
if self.faidx.as_raw:
return 'Fasta("%s", as_raw=True)' % (self.filename)
else:
return 'Fasta("%s")' % (self.filename)
return 'Fasta("%s")' % (self.filename)

def __iter__(self):
for key in self.keys():
yield self[key]

@property
def _records(self):
return OrderedDict((rname, FastaRecord(rname, self)) for
rname in self.keys())

def keys(self):
return self.faidx.index.keys()

Expand Down
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
setup(
name='pyfaidx',
provides='pyfaidx',
version='0.2.9',
version='0.3.0',
author='Matthew Shirley',
author_email='[email protected]',
url='http://mattshirley.com',
Expand All @@ -27,9 +27,10 @@
"Intended Audience :: Science/Research",
"Natural Language :: English",
"Operating System :: Unix",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.3",
"Programming Language :: Python :: 3.2",
"Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 2.6",
"Programming Language :: Python :: Implementation :: PyPy",
"Topic :: Scientific/Engineering :: Bio-Informatics"
Expand Down
30 changes: 30 additions & 0 deletions tests/test_MutableFastaRecord.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
from pyfaidx import Fasta
from tempfile import NamedTemporaryFile

path = os.path.dirname(__file__)
os.chdir(path)

class TestMutableFastaRecord:
def __init__(self):
self.genes = os.path.join(path, 'data/genes.fasta')
self.fasta = Fasta(self.genes)

def setup(self):
self.genes_copy = NamedTemporaryFile(mode='rb+')
self.genes_copy.write(open(self.genes, 'rb').read())
self.genes_copy.seek(0)
self.mutable_fasta = Fasta(self.genes_copy.name, mutable=True)

def teardown(self):
self.genes_copy.close() # deletes temporary file

def test_mutate_fasta_to_same(self):
chunk = self.fasta['KF435150.1'][0:100]
self.mutable_fasta['KF435150.1'][0:100] = chunk.seq
assert str(self.fasta['KF435150.1']) == str(self.mutable_fasta['KF435150.1'])

def test_mutate_fasta_to_N(self):
chunk = 100 * 'N'
self.mutable_fasta['KF435150.1'][0:100] = chunk
assert self.mutable_fasta['KF435150.1'][0:100].seq == chunk
2 changes: 1 addition & 1 deletion tests/test_feature_default_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
path = os.path.dirname(__file__)
os.chdir(path)

class TestFeatureBoundsCheck:
class TestFeatureDefaultSeq:
def __init__(self):
self.fasta = os.path.join(path, 'data/genes.fasta')
self.faidx = Faidx(self.fasta, default_seq='N')
Expand Down

0 comments on commit 6160c2d

Please sign in to comment.