Bug fixes and minor feature updates (#155)
* Added AminoAcidFrequencyDistribution report: plots a barplot of the frequency of each amino acid at each position across all sequences in a dataset (any dataset type)
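
  As a rough illustration of the counting this report is built on (an illustrative helper only, not the report's actual API):

  ```python
  from collections import Counter

  def positional_aa_counts(sequences):
      """Tally amino acid occurrences per (1-based) position across sequences.
      Illustrative sketch only; the real report additionally supports IMGT
      position numbering, relative frequencies and splitting by label values."""
      counts = {}
      for seq in sequences:
          for position, aa in enumerate(seq, start=1):
              counts.setdefault(position, Counter())[aa] += 1
      return counts

  # e.g. counts at position 4 across three CDR3 amino acid sequences
  print(positional_aa_counts(["CASSL", "CASSP", "CAWSV"])[4])  # Counter({'S': 3})
  ```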

* small fix in SequenceLengthDistribution (set output_written=False to ensure correct error message, since this report writes no data file)

* Updated color palette

* updated AminoAcidFrequencyDistribution to include splitting by label values

* Updated docs

* bugfix in Matches report: get subject IDs

* add top/bottom n and filtering to FeatureValueBarplot

* Allow metrics to be computed during MLApplication if the same label is provided

* small fix to make tests pass

* fix: HTML output was overwritten

* bugfix: don't access _proba columns when they are not defined

* bugfix: convert everything to string

* add RepertoireClonotypeSummary report

* add default params to RepertoireClonotypeSummary

* add ReferenceSequenceAnnotator preprocessor

* fix: make ClonesPerRepertoireFilter limits inclusive (as per docs)

* imports: check explicitly whether to import sequences with stop codons

* imports: add logging for removed sequences

* add more logging for DatasetExportInstruction

* support different region types when calling CompAIRR

* fix: DatasetExport result path

* bugfix: make dataset from new repertoires in ClonesPerRepertoireFilter

* fix: don't export field_list to metadata

* add SequenceLengthFilter

* fix filenames in AIRRExporter

* add subject ids as a column to Matches output

* add nicer feature names to MatchedSequencesEncoder

* minor updates to the 'Report' class
- allow different plot callables (useful for reports with multiple plots)
- generic function for writing output tables, as this type of code is repeated in many reports (sketched below)
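
A minimal sketch of what such a shared table-writing helper could look like (method name, signature, and the ReportOutput import path are assumptions, not the committed implementation):

```python
from pathlib import Path

import pandas as pd

from immuneML.reports.ReportOutput import ReportOutput  # import path assumed


def _write_output_table(df: pd.DataFrame, file_path: Path, name: str = None) -> ReportOutput:
    """Write a report table to CSV and wrap it as a ReportOutput.

    Sketch of the shared helper described in the bullet above; the committed
    method may differ in name and signature."""
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(file_path, index=False)
    return ReportOutput(path=file_path, name=name)
```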

* ParameterValidator:
- add min_exclusive and max_exclusive (in addition to inclusive)
- add assert_valid_tabular_file to check if a file exists and contains the expected columns
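
Hedged usage sketch for these ParameterValidator additions; the keyword names mirror the existing inclusive variants, and the assert_valid_tabular_file signature is an assumption:

```python
from immuneML.util.ParameterValidator import ParameterValidator

# exclusive bounds, analogous to the existing min_inclusive/max_inclusive keywords:
# a learning rate that must lie strictly between 0 and 1
ParameterValidator.assert_type_and_value(0.05, float, "MyComponent", "learning_rate",
                                         min_exclusive=0, max_exclusive=1)

# check that a tabular file exists and contains the expected columns
# (keyword name 'expected_columns' is an assumption)
ParameterValidator.assert_valid_tabular_file("reference_sequences.tsv", "MyComponent",
                                             "reference_file",
                                             expected_columns=["sequence_aa", "v_call", "j_call"])
```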

* support sequence length distribution for sequence datasets

* remove internal cross-validation in the outer assessment loop for sklearn models

* update ReferenceSequenceAnnotator

* explicitly order columns when exporting repertoires for CompAIRR

* fix: Receptor superclass call (pass arguments as keywords)

* fix: add missing params to CompAIRRParams in CompAIRRSequenceAbundanceEncoder

* fix: column names for matching in CompAIRRSequenceAbundanceEncoder

* fix: path for importing repertoire files

* fix tests

* fix tests and dependencies, update version

* fix region type handling in AIRRImport

---------

Co-authored-by: Lonneke Scheffer <[email protected]>
Co-authored-by: Theodor Bergersen <[email protected]>
3 people authored May 25, 2023
1 parent 32a0fc5 commit 0991c63
Showing 90 changed files with 1,537 additions and 389 deletions.
5 changes: 3 additions & 2 deletions immuneML/IO/dataset_export/AIRRExporter.py
@@ -69,7 +69,7 @@ def export(dataset: Dataset, path: Path, number_of_processes: int = 1):
def export_repertoire(repertoire: Repertoire, repertoire_path: Path):
df = AIRRExporter._repertoire_to_dataframe(repertoire)
df = AIRRExporter._postprocess_dataframe(df)
output_file = repertoire_path / f"{repertoire.data_filename.stem}.tsv"
output_file = repertoire_path / f"{repertoire.data_filename.stem if 'subject_id' not in repertoire.metadata else repertoire.metadata['subject_id']}.tsv"
airr.dump_rearrangement(df, str(output_file))

@staticmethod
@@ -89,7 +89,8 @@ def get_sequence_aa_field(region_type):
def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv") for repertoire in dataset.get_data()]
df["filename"] = [f"{repertoire.data_filename.stem if 'subject_id' not in repertoire.metadata else repertoire.metadata['subject_id']}.tsv"
for repertoire in dataset.get_data()]
df['identifier'] = identifiers
df.to_csv(result_path / "metadata.csv", index=False)

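
Both hunks above now pick the exported file stem the same way; a small helper capturing that shared logic could look like this (hypothetical refactoring, not part of the commit):

```python
from immuneML.data_model.repertoire.Repertoire import Repertoire


def _export_stem(repertoire: Repertoire) -> str:
    """Mirror of the committed logic: prefer subject_id from the repertoire
    metadata as the exported file stem, otherwise fall back to the stem of the
    stored data file. (Hypothetical helper; the commit inlines this expression.)"""
    if repertoire.metadata and 'subject_id' in repertoire.metadata:
        return str(repertoire.metadata['subject_id'])
    return repertoire.data_filename.stem


# usage inside export_repertoire / export_updated_metadata would then read:
# output_file = repertoire_path / f"{_export_stem(repertoire)}.tsv"
```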
4 changes: 3 additions & 1 deletion immuneML/IO/dataset_import/AIRRImport.py
@@ -141,12 +141,14 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
df.loc[:, "region_types"] = params.region_type.name
elif "junction" in params.column_mapping or "junction_aa" in params.column_mapping:
ImportHelper.junction_to_cdr3(df, params.region_type)
else:
df.loc[:, 'region_types'] = params.region_type.name
else:
df.loc[:, "region_types"] = params.region_type.name
# todo else: support "full_sequence" import through regiontype?

ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, import_with_stop_codon=params.import_with_stop_codon)
ImportHelper.update_gene_info(df)
ImportHelper.load_chains(df)

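
drop_illegal_character_sequences now receives import_with_stop_codon explicitly; a simplified sketch of the intended behaviour (not the actual ImportHelper implementation, which operates on the dataframe):

```python
import logging


def drop_illegal_aa_sequences(sequences, import_illegal_characters: bool,
                              import_with_stop_codon: bool):
    """Simplified sketch: keep amino acid sequences unless they contain characters
    outside the alphabet; '*' (stop codon) is only treated as legal when
    import_with_stop_codon is True. Removed sequences are logged (see the
    'add logging for removed sequences' change)."""
    legal = set("ACDEFGHIKLMNPQRSTVWY")
    if import_with_stop_codon:
        legal.add("*")
    if import_illegal_characters:
        return sequences
    kept = [seq for seq in sequences if set(seq) <= legal]
    removed = len(sequences) - len(kept)
    if removed:
        logging.warning(f"{removed} sequences removed due to illegal characters.")
    return kept
```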
2 changes: 1 addition & 1 deletion immuneML/IO/dataset_import/GenericImport.py
@@ -110,7 +110,7 @@ def import_dataset(params: dict, dataset_name: str) -> Dataset:
@staticmethod
def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)
ImportHelper.junction_to_cdr3(df, params.region_type)
df.loc[:, "region_types"] = params.region_type.name
ImportHelper.update_gene_info(df)
2 changes: 1 addition & 1 deletion immuneML/IO/dataset_import/IGoRImport.py
@@ -126,7 +126,7 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
df.loc[:, "region_types"] = params.region_type.name
# note: import_empty_aa_sequences is set to true here; since IGoR doesnt output aa, this parameter is insensible
ImportHelper.drop_empty_sequences(df, True, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)

# chain or at least receptorsequence?

2 changes: 1 addition & 1 deletion immuneML/IO/dataset_import/MiXCRImport.py
@@ -138,7 +138,7 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
df["j_alleles"] = MiXCRImport._load_alleles(df, "j_alleles")

ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)
ImportHelper.update_gene_info(df)
ImportHelper.load_chains(df)

2 changes: 1 addition & 1 deletion immuneML/IO/dataset_import/OLGAImport.py
@@ -87,7 +87,7 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
df["sequence_identifiers"] = None

ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)
ImportHelper.junction_to_cdr3(df, params.region_type)
df.loc[:, "region_types"] = params.region_type.name
ImportHelper.update_gene_info(df)
2 changes: 1 addition & 1 deletion immuneML/IO/dataset_import/TenxGenomicsImport.py
@@ -121,7 +121,7 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
ImportHelper.junction_to_cdr3(df, params.region_type)
df.loc[:, "region_types"] = params.region_type.name
ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)
ImportHelper.update_gene_info(df)
ImportHelper.load_chains(df)

4 changes: 2 additions & 2 deletions immuneML/IO/dataset_import/VDJdbImport.py
@@ -122,7 +122,7 @@ def preprocess_dataframe(df: pd.DataFrame, params: DatasetImportParams):
df.loc[df["sequence_identifiers"] == "0", "sequence_identifiers"] = None

ImportHelper.drop_empty_sequences(df, params.import_empty_aa_sequences, params.import_empty_nt_sequences)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters)
ImportHelper.drop_illegal_character_sequences(df, params.import_illegal_characters, params.import_with_stop_codon)
ImportHelper.update_gene_info(df)
ImportHelper.load_chains(df)

@@ -167,7 +167,7 @@ def get_sequence_identifiers(receptor_identifiers, chains):
return sequence_identifiers
else:
counts = sequence_identifiers.value_counts()
for id, count in counts[counts > 1].iteritems():
for id, count in counts[counts > 1].items():
unique_ids = [f"{id}{i}" for i in range(1, count+1)]
sequence_identifiers.loc[sequence_identifiers == id] = unique_ids
return sequence_identifiers
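
Series.iteritems() was removed in recent pandas versions, hence the switch to .items(); on a toy Series the de-duplication shown above behaves as follows (standalone illustration):

```python
import pandas as pd

sequence_identifiers = pd.Series(["r1", "r2", "r2", "r3"])
counts = sequence_identifiers.value_counts()

# suffix each duplicated identifier with a running index, as in get_sequence_identifiers
for id_, count in counts[counts > 1].items():
    unique_ids = [f"{id_}{i}" for i in range(1, count + 1)]
    sequence_identifiers.loc[sequence_identifiers == id_] = unique_ids

print(sequence_identifiers.tolist())  # ['r1', 'r21', 'r22', 'r3']
```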
@@ -1 +1,2 @@
number_of_processes: 4
metrics: null
@@ -0,0 +1,3 @@
min_len: 5
max_len: -1
sequence_type: amino_acid
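
These look like the default parameters of the new SequenceLengthFilter; read as a predicate, and assuming max_len: -1 means "no upper bound" (an assumption, not confirmed by the diff), the criterion is roughly:

```python
def passes_length_filter(sequence: str, min_len: int = 5, max_len: int = -1) -> bool:
    """Sketch of the filter criterion implied by the defaults above: sequences
    shorter than min_len are removed; max_len of -1 is read here as no upper limit."""
    if len(sequence) < min_len:
        return False
    if max_len != -1 and len(sequence) > max_len:
        return False
    return True


print(passes_length_filter("CASS"))          # False, shorter than min_len=5
print(passes_length_filter("CASSLGTDTQYF"))  # True
```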
@@ -0,0 +1,4 @@
imgt_positions: True
relative_frequency: True
split_by_label: False
label: null
@@ -0,0 +1 @@
color_by_label: None
@@ -0,0 +1 @@
sequence_type: amino_acid
16 changes: 16 additions & 0 deletions immuneML/data_model/dataset/RepertoireDataset.py
@@ -10,10 +10,26 @@
from immuneML.data_model.repertoire.Repertoire import Repertoire
from immuneML.environment.Constants import Constants
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder


class RepertoireDataset(Dataset):

@classmethod
def build_from_objects(cls, **kwargs):
ParameterValidator.assert_keys_present(list(kwargs.keys()), ['repertoires', 'path'], RepertoireDataset.__name__, RepertoireDataset.__name__)
ParameterValidator.assert_all_type_and_value(kwargs['repertoires'], Repertoire, RepertoireDataset.__name__, 'repertoires')

metadata_df = pd.DataFrame.from_records([{**rep.metadata, **{'filename': rep.data_filename}} for rep in kwargs['repertoires']])

if 'field_list' in metadata_df.columns:
metadata_df.drop(columns=['field_list'], inplace=True)

metadata_path = PathBuilder.build(kwargs['path']) / 'metadata.csv'
metadata_df.to_csv(metadata_path, index=False)

return RepertoireDataset(repertoires=kwargs['repertoires'], metadata_file=metadata_path)

@classmethod
def build(cls, **kwargs):
ParameterValidator.assert_keys_present(list(kwargs.keys()), ['metadata_file', 'name', 'repertoire_ids', 'metadata_fields'],
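
A hedged usage sketch of the new build_from_objects factory (only the 'repertoires' and 'path' arguments are confirmed by the diff; the surrounding function is illustrative):

```python
from pathlib import Path

from immuneML.data_model.dataset.RepertoireDataset import RepertoireDataset


def make_filtered_dataset(kept_repertoires, result_path: Path) -> RepertoireDataset:
    """Sketch: rebuild a RepertoireDataset from an already-filtered list of
    Repertoire objects (e.g. inside ClonesPerRepertoireFilter). The factory
    writes metadata.csv under the given path and drops the internal
    'field_list' column, as shown in the diff above."""
    return RepertoireDataset.build_from_objects(repertoires=kept_repertoires,
                                                path=result_path / "filtered")
```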
3 changes: 1 addition & 2 deletions immuneML/data_model/receptor/BCKReceptor.py
@@ -1,5 +1,4 @@
import json
from uuid import uuid4

from immuneML.data_model.receptor.Receptor import Receptor
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
@@ -34,7 +33,7 @@ def get_record_names(cls):
+ [name for name in cls.FIELDS if name not in ['heavy', 'kappa']]

def __init__(self, heavy: ReceptorSequence = None, kappa: ReceptorSequence = None, metadata: dict = None, identifier: str = None):
super().__init__(metadata, identifier)
super().__init__(metadata=metadata, identifier=identifier)
self.heavy = heavy
self.kappa = kappa

3 changes: 1 addition & 2 deletions immuneML/data_model/receptor/BCReceptor.py
@@ -1,5 +1,4 @@
import json
from uuid import uuid4

from immuneML.data_model.receptor.Receptor import Receptor
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
@@ -34,7 +33,7 @@ def get_record_names(cls):

def __init__(self, heavy: ReceptorSequence = None, light: ReceptorSequence = None, metadata: dict = None,
identifier: str = None):
super().__init__(metadata, identifier)
super().__init__(metadata=metadata, identifier=identifier)
self.heavy = heavy
self.light = light

4 changes: 2 additions & 2 deletions immuneML/data_model/receptor/ReceptorBuilder.py
@@ -9,7 +9,7 @@
from immuneML.data_model.receptor.TCABReceptor import TCABReceptor
from immuneML.data_model.receptor.TCGDReceptor import TCGDReceptor
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequenceList import ReceptorSequenceList
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence


class ReceptorBuilder:
@@ -29,7 +29,7 @@ def build_object(cls, sequences: dict, identifier: str = None, metadata: dict =
return None

@classmethod
def build_objects(cls, sequences: ReceptorSequenceList) -> List[Receptor]:
def build_objects(cls, sequences: List[ReceptorSequence]) -> List[Receptor]:
receptors = []
sequences_per_chain = {chain.value: [sequence for sequence in sequences if sequence.metadata.chain.value == chain.value]
for chain in Chain}
3 changes: 1 addition & 2 deletions immuneML/data_model/receptor/TCABReceptor.py
@@ -1,5 +1,4 @@
import json
from uuid import uuid4

import numpy as np

@@ -29,7 +28,7 @@ def create_from_record(cls, record: np.void):
raise NotImplementedError(f"Supported ({TCABReceptor.version}) and available version differ, but there is no converter available.")

def __init__(self, alpha: ReceptorSequence = None, beta: ReceptorSequence = None, metadata: dict = None, identifier: str = None):
super().__init__(metadata, identifier)
super().__init__(metadata=metadata, identifier=identifier)
self.alpha = alpha
self.beta = beta

3 changes: 1 addition & 2 deletions immuneML/data_model/receptor/TCGDReceptor.py
@@ -1,5 +1,4 @@
import json
from uuid import uuid4

import numpy as np

@@ -35,7 +34,7 @@ def get_record_names(cls):
+ [name for name in cls.FIELDS if name not in ['gamma', 'delta']]

def __init__(self, gamma: ReceptorSequence = None, delta: ReceptorSequence = None, metadata: dict = None, identifier: str = None):
super().__init__(metadata, identifier)
super().__init__(metadata=metadata, identifier=identifier)
self.gamma = gamma
self.delta = delta

@@ -1,5 +1,6 @@
# quality: gold
import json
from uuid import uuid4

import numpy as np

@@ -35,12 +36,12 @@ def __init__(self,
nucleotide_sequence: str = None,
identifier: str = None,
annotation: SequenceAnnotation = None,
metadata: SequenceMetadata = SequenceMetadata()):
self.identifier = identifier
metadata: SequenceMetadata = None):
self.identifier = identifier if identifier is not None and identifier != "" else uuid4().hex
self.amino_acid_sequence = amino_acid_sequence
self.nucleotide_sequence = nucleotide_sequence
self.annotation = annotation
self.metadata = metadata
self.metadata = metadata if metadata is not None else SequenceMetadata()

def __repr__(self):
return f"ReceptorSequence(sequence_aa={self.amino_acid_sequence}, sequence={self.nucleotide_sequence}, " \
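
The old signature used a single shared SequenceMetadata() instance as the default argument; the generic Python pitfall this change avoids (illustration only, using unrelated stand-in classes):

```python
class Metadata:
    def __init__(self):
        self.fields = {}


def make_item_buggy(metadata=Metadata()):      # default evaluated once, shared by all calls
    return metadata


def make_item_fixed(metadata=None):            # default created per call, as in the commit
    return metadata if metadata is not None else Metadata()


a, b = make_item_buggy(), make_item_buggy()
print(a is b)  # True: both items share one Metadata object

c, d = make_item_fixed(), make_item_fixed()
print(c is d)  # False: each item gets its own Metadata object
```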

This file was deleted.

39 changes: 20 additions & 19 deletions immuneML/data_model/repertoire/Repertoire.py
@@ -18,7 +18,6 @@
from immuneML.data_model.receptor.RegionType import RegionType
from immuneML.data_model.receptor.receptor_sequence.Chain import Chain
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequence import ReceptorSequence
from immuneML.data_model.receptor.receptor_sequence.ReceptorSequenceList import ReceptorSequenceList
from immuneML.data_model.receptor.receptor_sequence.SequenceAnnotation import SequenceAnnotation
from immuneML.data_model.receptor.receptor_sequence.SequenceMetadata import SequenceMetadata
from immuneML.environment.EnvironmentSettings import EnvironmentSettings
@@ -112,24 +111,21 @@ def build(cls, sequence_aas: list = None, sequences: list = None, v_genes: list

@classmethod
def build_like(cls, repertoire, indices_to_keep: list, result_path: Path, filename_base: str = None):
if indices_to_keep is not None and len(indices_to_keep) > 0:
PathBuilder.build(result_path)
PathBuilder.build(result_path)

data = repertoire.load_data()
data = data[indices_to_keep]
identifier = uuid4().hex
filename_base = filename_base if filename_base is not None else identifier
data = repertoire.load_data()
data = data[indices_to_keep]
identifier = uuid4().hex
filename_base = filename_base if filename_base is not None else identifier

data_filename = result_path / f"{filename_base}.npy"
np.save(str(data_filename), data)
data_filename = result_path / f"{filename_base}.npy"
np.save(str(data_filename), data)

metadata_filename = result_path / f"{filename_base}_metadata.yaml"
shutil.copyfile(repertoire.metadata_filename, metadata_filename)
metadata_filename = result_path / f"{filename_base}_metadata.yaml"
shutil.copyfile(repertoire.metadata_filename, metadata_filename)

new_repertoire = Repertoire(data_filename, metadata_filename, identifier)
return new_repertoire
else:
return None
new_repertoire = Repertoire(data_filename, metadata_filename, identifier)
return new_repertoire

@classmethod
def build_from_sequence_objects(cls, sequence_objects: list, path: Path, metadata: dict, filename_base: str = None):
@@ -241,10 +237,15 @@ def get_attributes(self, attributes: list):
return result

def get_region_type(self):
region_types = set(self.get_attribute("region_types"))
assert len(region_types) == 1, f"Repertoire: expected one region_type, found: {region_types}"
region_types = self.get_attribute("region_types")
if region_types is not None:
region_types = set(region_types)
assert len(region_types) == 1, f"Repertoire {self.identifier}: expected one region_type, found: {region_types}"

return RegionType(region_types.pop())
return RegionType(region_types.pop())
else:
logging.warning(f'Repertoire {self.identifier}: region_types are not set for sequences.')
return None

def free_memory(self):
self.data = None
@@ -309,7 +310,7 @@ def _prepare_cell_lists(self):
return same_cell_lists

def _make_receptors(self, cell_content):
sequences = ReceptorSequenceList()
sequences = []
for item in cell_content:
sequences.append(self._make_sequence_object(item))
return ReceptorBuilder.build_objects(sequences)
21 changes: 17 additions & 4 deletions immuneML/dsl/instruction_parsers/MLApplicationParser.py
@@ -9,6 +9,7 @@
from immuneML.environment.Label import Label
from immuneML.environment.LabelConfiguration import LabelConfiguration
from immuneML.hyperparameter_optimization.HPSetting import HPSetting
from immuneML.ml_metrics.Metric import Metric
from immuneML.util.ParameterValidator import ParameterValidator
from immuneML.util.PathBuilder import PathBuilder
from immuneML.workflows.instructions.ml_model_application.MLApplicationInstruction import MLApplicationInstruction
@@ -25,22 +26,34 @@ class MLApplicationParser:
type: MLApplication
dataset: d1
config_path: ./config.zip
metrics:
- accuracy
- precision
- recall
number_of_processes: 4
label: CD
"""

def parse(self, key: str, instruction: dict, symbol_table: SymbolTable, path: Path) -> MLApplicationInstruction:
location = MLApplicationParser.__name__
ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'number_of_processes', 'config_path'], location, key)
ParameterValidator.assert_keys(instruction.keys(), ['type', 'dataset', 'number_of_processes', 'config_path', 'metrics'], location, key)
ParameterValidator.assert_in_valid_list(instruction['dataset'], symbol_table.get_keys_by_type(SymbolType.DATASET), location, f"{key}: dataset")
ParameterValidator.assert_type_and_value(instruction['number_of_processes'], int, location, f"{key}: number_of_processes", min_inclusive=1)
ParameterValidator.assert_type_and_value(instruction['config_path'], str, location, f'{key}: config_path')

if 'metrics' in instruction and instruction['metrics'] is not None:
ParameterValidator.assert_type_and_value(instruction['metrics'], list, location, f'{key}: metrics')
metrics = [Metric.get_metric(metric) for metric in instruction["metrics"]]
else:
metrics = []

hp_setting, label = self._parse_hp_setting(instruction, path, key)

instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key, number_of_processes=instruction['number_of_processes'],
label_configuration=LabelConfiguration([label]), hp_setting=hp_setting)
instruction = MLApplicationInstruction(dataset=symbol_table.get(instruction['dataset']), name=key,
number_of_processes=instruction['number_of_processes'],
label_configuration=LabelConfiguration([label]),
hp_setting=hp_setting,
metrics=metrics)

return instruction
