Skip to content

Commit

Permalink
v0.3.4
Browse files Browse the repository at this point in the history
  • Loading branch information
asistradition committed Mar 10, 2022
1 parent 2fc99aa commit 8b2167b
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 12 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
### Version 0.3.4

* Produce a filtered TF binding table when `--save_filtered_location_data` is set.

### Version 0.3.3

* Correctly produce a TF binding table when `--save_location_data` is set.

### Version 0.3.2
Expand Down
40 changes: 29 additions & 11 deletions inferelator_prior/network_from_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ def main():
shuffle=args.shuffle,
lowmem=not args.highmem,
intergenic_only=_intergenic,
save_locs=args.save_locs)
save_locs=args.save_locs,
save_locs_filtered=args.save_locs_filtered)

print("Prior matrix with {n} edges constructed".format(n=prior_matrix.sum().sum()))

Expand Down Expand Up @@ -88,6 +89,9 @@ def add_common_arguments(argp):
argp.add_argument("-o", "--out", dest="out", help="Output PATH prefix", metavar="PATH", default="./prior")
argp.add_argument("--save_location_data", dest="save_locs", help="Save a dataframe with TF->Gene binding locations",
action='store_const', const=True, default=False)
argp.add_argument("--save_filtered_location_data", dest="save_locs_filtered",
help="Save a dataframe with post-filter TF->Gene binding locations",
action='store_const', const=True, default=False)
argp.add_argument("-c", "--cpu", dest="cores", help="Number of cores", metavar="CORES", type=int, default=None)
argp.add_argument("--genes", dest="genes", help="A list of genes to build connectivity matrix for. Optional.",
default=None, type=str)
Expand Down Expand Up @@ -158,7 +162,8 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
truncate_prob=0.35, scanner_thresh="1e-4", motif_format="meme",
gene_constraint_list=None, regulator_constraint_list=None,
output_prefix=None, debug=False, fuzzy_motif_names=False, motif_info=None,
shuffle=None, lowmem=False, intergenic_only=True, save_locs=False):
shuffle=None, lowmem=False, intergenic_only=True, save_locs=False,
save_locs_filtered=False):
"""
Build a motif-based prior from windows around annotated genes.
Expand Down Expand Up @@ -210,14 +215,19 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
:type intergenic_only: bool
:param save_locs: Save motif mapping positions to a file, Defaults to False
:type save_locs: bool
:param save_locs_filtered: Save filtered TF -> Gene mapping locations to a file, Defaults to False
:type save_locs_filtered: bool
:return prior_matrix, raw_matrix, prior_data: Filtered connectivity matrix, unfiltered score matrix, and unfiltered
long dataframe with scored TF->Gene pairs and genomic locations
:rtype: pd.DataFrame, pd.DataFrame, pd.DataFrame
"""

if save_locs:
if save_locs and output_prefix is not None:
save_locs = output_prefix + "_tf_binding_locs.tsv"

if save_locs_filtered and output_prefix is not None:
save_locs_filtered = output_prefix + "_tf_binding_locs_filtered.tsv"

# PROCESS GENE ANNOTATIONS #########################################################################################

print("Loading genes from file ({f})".format(f=annotation_file))
Expand Down Expand Up @@ -263,7 +273,8 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
# PROCESS SCORES INTO NETWORK ##################################################################################
print("{n} regulatory edges identified by motif search".format(n=(raw_matrix != 0).sum().sum()))

return network_build(raw_matrix, prior_data, num_cores=num_cores, output_prefix=output_prefix, debug=debug)
return network_build(raw_matrix, prior_data, num_cores=num_cores, output_prefix=output_prefix, debug=debug,
save_locs_filtered=save_locs_filtered)

else:
MotifScan.set_type(scanner_type)
Expand Down Expand Up @@ -296,10 +307,10 @@ def network_scan_build_single_tf(tf_mi_df):
columns=tf_mi_df[MOTIF_NAME_COL].unique().tolist())
pr_da = None

if save_locs:
return network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True), motif_peaks
else:
return network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True)
raw_loc = motif_peaks if save_locs else None
net_results = network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True)

return net_results, raw_loc

# MULTIPROCESS PER-TF ##########################################################################################
prior_matrix, raw_matrix, prior_data = [], [], []
Expand All @@ -310,9 +321,8 @@ def network_scan_build_single_tf(tf_mi_df):

if save_locs:
res[1].to_csv(save_locs, sep="\t", mode="w" if i == 0 else "a", header=i == 0)
res = res[0]

p_m, r_m, p_d = res
p_m, r_m, p_d = res[0]

print("Processed TF {i}/{n}".format(i=i, n=len(motif_information)))
prior_matrix.append(p_m)
Expand All @@ -337,6 +347,9 @@ def network_scan_build_single_tf(tf_mi_df):
print("Writing output file {o}".format(o=output_prefix + "_edge_matrix.tsv.gz"))
(prior_matrix != 0).astype(int).to_csv(output_prefix + "_edge_matrix.tsv.gz", sep="\t")

if save_locs_filtered:
prior_data.to_csv(save_locs_filtered, sep="\t")

return prior_matrix, raw_matrix, prior_data


Expand Down Expand Up @@ -419,7 +432,8 @@ def network_scan(motifs, motif_information, genes, genomic_fasta_file, constrain
return raw_matrix, prior_data


def network_build(raw_matrix, prior_data, num_cores=1, output_prefix=None, debug=False, silent=False):
def network_build(raw_matrix, prior_data, num_cores=1, output_prefix=None, debug=False, silent=False,
save_locs_filtered=False):

if output_prefix is not None:
print("Writing output file {o}".format(o=output_prefix + "_unfiltered_matrix.tsv.gz"))
Expand All @@ -437,6 +451,10 @@ def network_build(raw_matrix, prior_data, num_cores=1, output_prefix=None, debug
pm_melt = prior_matrix.reset_index().melt(id_vars=PRIOR_GENE, var_name=PRIOR_TF, value_name='Filter_Included')
prior_data = pd.merge(prior_data, pm_melt)

if save_locs_filtered and output_prefix is not None:
print(f"Writing output file {save_locs_filtered}")
prior_data.to_csv(save_locs_filtered, sep="\t")

return prior_matrix, raw_matrix, prior_data


Expand Down
69 changes: 69 additions & 0 deletions inferelator_prior/tests/test_network_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ def test_file_output(self):
self.assertTrue(os.path.exists(temp_path_prefix + "_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "_edge_matrix.tsv.gz"))
self.assertFalse(os.path.exists(temp_path_prefix + "_tf_binding_locs.tsv"))
self.assertFalse(os.path.exists(temp_path_prefix + "_tf_binding_locs_filtered.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
Expand Down Expand Up @@ -145,3 +146,71 @@ def test_file_output(self):
self.assertTrue(os.path.exists(temp_path_prefix + "c_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "c_edge_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "c_tf_binding_locs.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "d",
save_locs_filtered=True,
lowmem=True)

self.assertTrue(os.path.exists(temp_path_prefix + "d_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "d_edge_matrix.tsv.gz"))
self.assertFalse(os.path.exists(temp_path_prefix + "d_tf_binding_locs.tsv"))
self.assertTrue(os.path.exists(temp_path_prefix + "d_tf_binding_locs_filtered.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "e",
save_locs_filtered=True)

self.assertTrue(os.path.exists(temp_path_prefix + "e_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "e_edge_matrix.tsv.gz"))
self.assertFalse(os.path.exists(temp_path_prefix + "e_tf_binding_locs.tsv"))
self.assertTrue(os.path.exists(temp_path_prefix + "e_tf_binding_locs_filtered.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "f",
save_locs_filtered=True,
save_locs=True,
lowmem=True)

self.assertTrue(os.path.exists(temp_path_prefix + "f_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "f_edge_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "f_tf_binding_locs.tsv"))
self.assertTrue(os.path.exists(temp_path_prefix + "f_tf_binding_locs_filtered.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "g",
save_locs_filtered=True,
save_locs=True)

self.assertTrue(os.path.exists(temp_path_prefix + "g_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "g_edge_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "g_tf_binding_locs.tsv"))
self.assertTrue(os.path.exists(temp_path_prefix + "g_tf_binding_locs_filtered.tsv"))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

install_requires = ["numpy", "pandas>=1.0", "HTSeq", "pybedtools", "scipy", "pathos", "sklearn", "tqdm"]
tests_require = ["coverage", "nose", "pysam"]
version = "0.3.3"
version = "0.3.4"

# Description from README.md
base_dir = os.path.dirname(os.path.abspath(__file__))
Expand Down

0 comments on commit 8b2167b

Please sign in to comment.