Skip to content

Commit

Permalink
MSFragger Results Folder Reader (#792)
Browse files Browse the repository at this point in the history
* created interface IQuantifiable

* created IQuantifiableRecord and IQuantifiable interfaces, deleted original IQuantifiable in FlashLFQ

* created IdentificationAdapter class and edited IquantifiableRecord interface

* added implementation of IQuantifiableRecord to MSFraggerPSM and wrote tests

* created test for IdentificationAdapter method

* created Dictionary linking MSFragger file names to their corresponding full file paths

* created outline for MsFraggerCombinedResults class

* finished MsFraggerCombinedResults and wrote test

* finished writing tests for MSFraggerCombinedResults

* edited big files

* edited FileNameToFilePath method in MSFraggerPsm to account for MSFragger file name additions and wrote tests

* edited MsFraggerPsm to account for differences in result files (probability vs. peptideprophetprobability)

* removed unneeded test from TestQuantifiedPeaks

* made recommended changes by reviewers

* made recommended changes

* made recommended changes

* added comments, made changes to FileNameToFilePath, and wrote test for FileNameToFilePath method w/out parameter

* file changed

* added experiment_annotation.tsv file to supported file types

* added test for FileNameToFilePath method with parameter

* wrote test for Experiment annotation file WriteResults

* resolved merge conflicts pt.2

* made recommended changes

* deleted comment

---------

Co-authored-by: trishorts <[email protected]>
Co-authored-by: Nic Bollis <[email protected]>
  • Loading branch information
3 people authored Oct 30, 2024
1 parent 7dcf9a9 commit cb08d67
Show file tree
Hide file tree
Showing 17 changed files with 420 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,6 @@ public interface IQuantifiableResultFile : IResultFile
/// </summary>
/// <param name="fullFilePath"> list of file paths associated with each distinct record </param>
/// <returns> Dictionary of file names and their associated full paths </returns>
public Dictionary<string, string> FileNametoFilePath(List<string> fullFilePath);
public Dictionary<string, string> FileNameToFilePath(List<string> fullFilePath);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using CsvHelper.Configuration;
using CsvHelper.Configuration.Attributes;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Readers
{
/// <summary>
/// One row of an experiment_annotation.tsv file.
/// Each property is bound to a column by its header name.
/// </summary>
public class ExperimentAnnotation
{
    /// <summary>
    /// CsvHelper settings used for experiment_annotation.tsv files:
    /// tab-delimited, with a header record, blank lines ignored, and trimming enabled.
    /// </summary>
    public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture)
    {
        HasHeaderRecord = true,
        Delimiter = "\t",
        TrimOptions = TrimOptions.Trim,
        IgnoreBlankLines = true
    };

    // ---- experiment_annotation columns (declaration order is intentionally unchanged) ----

    /// <summary> Value of the "file" column </summary>
    [Name("file")]
    public string File { get; set; }

    /// <summary> Value of the "sample" column </summary>
    [Name("sample")]
    public string Sample { get; set; }

    /// <summary> Value of the "sample_name" column </summary>
    [Name("sample_name")]
    public string SampleName { get; set; }

    /// <summary> Value of the "condition" column </summary>
    [Name("condition")]
    public string Condition { get; set; }

    /// <summary> Value of the "replicate" column </summary>
    [Name("replicate")]
    public string Replicate { get; set; }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using CsvHelper;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Readers
{
/// <summary>
/// Result file for an experiment annotation file; every record is an <see cref="ExperimentAnnotation"/>.
/// </summary>
public class ExperimentAnnotationFile : ResultFile<ExperimentAnnotation>, IResultFile
{
    public override SupportedFileType FileType => SupportedFileType.ExperimentAnnotation;

    public override Software Software { get; set; }

    /// <summary>
    /// Parameterless constructor used when the instance is created by the factory method
    /// </summary>
    public ExperimentAnnotationFile() : base() { }

    /// <summary>
    /// Creates the file for the given path and tags it with Software.MsFragger
    /// </summary>
    /// <param name="filePath">path to the experiment annotation file</param>
    public ExperimentAnnotationFile(string filePath) : base(filePath, Software.MsFragger) { }

    /// <summary>
    /// Reads every record found at FilePath and stores them in Results
    /// </summary>
    public override void LoadResults()
    {
        var textReader = new StreamReader(FilePath);

        // Disposing the CsvReader at the end of the method is what releases the underlying reader
        using var annotationReader = new CsvReader(textReader, ExperimentAnnotation.CsvConfiguration);
        Results = annotationReader.GetRecords<ExperimentAnnotation>().ToList();
    }

    /// <summary>
    /// Writes the header followed by every record in Results to the output path
    /// </summary>
    /// <param name="outputPath">destination path; the extension of this file type is appended when CanRead rejects the path</param>
    public override void WriteResults(string outputPath)
    {
        if (!CanRead(outputPath))
        {
            outputPath += FileType.GetFileExtension();
        }

        var textWriter = new StreamWriter(File.Create(outputPath));
        using var annotationWriter = new CsvWriter(textWriter, ExperimentAnnotation.CsvConfiguration);

        annotationWriter.WriteHeader<ExperimentAnnotation>();
        foreach (var annotation in Results)
        {
            // The row break is written before each record, so the output does not end with an empty line
            annotationWriter.NextRecord();
            annotationWriter.WriteRecord(annotation);
        }
    }
}
}
171 changes: 171 additions & 0 deletions mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
using CsvHelper;
using Readers.ExternalResults.BaseClasses;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using MathNet.Numerics;

namespace Readers
{
/// <summary>
/// Reads a folder of MSFragger results: the experiment_annotation.tsv file at the top of the folder
/// and the psm.tsv file inside each sample sub-folder, exposing all PSMs as one combined result list.
/// </summary>
public class MsFraggerCombinedResults : ResultFile<MsFraggerPsm>, IResultFile, IQuantifiableResultFile
{
    #region Properties/Fields

    /// <summary> The full path to the folder of MSFragger results (same value as FilePath) </summary>
    public string FullFolderPath => FilePath;

    // Full paths to the psm.tsv file of every sample; populated by FindAllFilePaths
    private List<string> allPsmFilePaths;

    /// <summary> One MsFraggerPsmFile per sample psm.tsv file found in the results folder </summary>
    public List<MsFraggerPsmFile> AllPsmFiles { get; private set; }

    /// <summary>
    /// Descriptive information on every ms data file in the experiment
    /// (sample name, full path to the ms data file, etc.)
    /// </summary>
    public ExperimentAnnotationFile ExperimentAnnotations { get; private set; }

    #endregion

    #region IResultFile Implementation

    public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm;
    public override Software Software { get; set; }

    /// <param name="filePath"> full path to the MSFragger results folder </param>
    public MsFraggerCombinedResults(string filePath) : base(filePath, Software.MsFragger) { }

    /// <summary>
    /// Calls LoadExperimentAnnotationResults, FindAllFilePaths and LoadPsmResults (in that order),
    /// then concatenates the records of every MsFraggerPsmFile in AllPsmFiles into Results.
    /// </summary>
    /// <exception cref="FileNotFoundException"> the annotation file or a sample's psm.tsv file is missing </exception>
    public override void LoadResults()
    {
        LoadExperimentAnnotationResults();
        FindAllFilePaths();
        LoadPsmResults();

        var concatList = new List<MsFraggerPsm>();
        foreach (var file in AllPsmFiles)
        {
            // Enumerating the file yields its MsFraggerPsm records
            concatList.AddRange(file);
        }

        Results = concatList;
    }

    /// <summary>
    /// Not implemented for the combined results folder.
    /// </summary>
    /// <exception cref="NotImplementedException"> always thrown </exception>
    public override void WriteResults(string outputPath)
    {
        throw new NotImplementedException("Method not yet implemented.");
    }

    #endregion

    /// <summary>
    /// Checks that experiment_annotation.tsv exists in the results folder, wraps it in an
    /// ExperimentAnnotationFile, and sets the ExperimentAnnotations property.
    /// </summary>
    /// <exception cref="FileNotFoundException"> experiment_annotation.tsv is not in the results folder </exception>
    public void LoadExperimentAnnotationResults()
    {
        string combinedFilePath = Path.Combine(FullFolderPath, "experiment_annotation.tsv");
        if (!File.Exists(combinedFilePath))
        {
            // The message is unchanged; the path is attached so callers can see which file was expected
            throw new FileNotFoundException("The experiment_annotation.tsv file was not found", combinedFilePath);
        }

        ExperimentAnnotations = new ExperimentAnnotationFile(combinedFilePath);
    }

    /// <summary>
    /// Creates an MsFraggerPsmFile for each located psm.tsv path and rebuilds the AllPsmFiles list.
    /// The psm.tsv paths are located first if that has not happened yet.
    /// </summary>
    public void LoadPsmResults()
    {
        // This method is public, so it may be called before LoadResults has located the psm.tsv files
        if (allPsmFilePaths is null)
        {
            FindAllFilePaths();
        }

        AllPsmFiles = new List<MsFraggerPsmFile>();

        foreach (var path in allPsmFilePaths)
        {
            AllPsmFiles.Add(new MsFraggerPsmFile(path));
        }
    }

    public IEnumerable<IQuantifiableRecord> GetQuantifiableResults() => Results;

    /// <summary>
    /// Links the file name associated with each IQuantifiableRecord
    /// to the raw file path of MassSpec data in the filePaths list.
    /// The per-sample dictionaries are merged; when a file name occurs more than once, the first mapping is kept.
    /// </summary>
    /// <param name="filePaths"> list of file paths associated with each distinct record </param>
    /// <returns> Dictionary of file names and their associated full paths </returns>
    public Dictionary<string, string> FileNameToFilePath(List<string> filePaths)
    {
        // AllPsmFiles is only populated while loading, so load on demand instead of failing on null
        if (AllPsmFiles is null)
        {
            LoadResults();
        }

        var allFiles = new Dictionary<string, string>();

        foreach (var psmFile in AllPsmFiles)
        {
            foreach (var nameAndPath in psmFile.FileNameToFilePath(filePaths))
            {
                // TryAdd keeps the first mapping seen for a file name
                allFiles.TryAdd(nameAndPath.Key, nameAndPath.Value);
            }
        }

        return allFiles;
    }

    /// <summary>
    /// Links the file name associated with each IQuantifiableRecord to the raw file path of the MassSpec file,
    /// using the full file paths from the experiment annotation file.
    /// A path whose file name (without extension) equals the cleaned record file name is preferred;
    /// if there is none, the first path that contains the cleaned name is used.
    /// Record file names without any matching path are left out of the dictionary.
    /// </summary>
    /// <returns> Dictionary of file names and their associated full paths </returns>
    public Dictionary<string, string> FileNameToFilePath()
    {
        // The annotations are read before Results below, so make sure they exist first
        if (ExperimentAnnotations is null)
        {
            LoadExperimentAnnotationResults();
        }

        List<string> filePaths = ExperimentAnnotations.Select(annotation => annotation.File).Distinct().ToList();
        List<string> fileNames = Results.Select(psm => psm.FileName).Distinct().ToList();
        var allFiles = new Dictionary<string, string>();

        foreach (var name in fileNames)
        {
            // MSFragger results prefix the raw file name with "interact-" and replace .raw with .pep.xml
            // In order to correctly match the file names, these changes must be removed
            string fileName = Path.GetFileName(name).Replace("interact-", "").Replace(".pep.xml", "");

            // An exact name match is tried first so that e.g. "run_1" is not paired with "run_10.raw";
            // the substring match remains as a fallback for paths that carry extra decorations
            string matchingPath =
                filePaths.FirstOrDefault(path => string.Equals(Path.GetFileNameWithoutExtension(path), fileName, StringComparison.Ordinal))
                ?? filePaths.FirstOrDefault(path => path.Contains(fileName));

            if (matchingPath != null)
            {
                // fileNames is distinct, so each key is added at most once
                allFiles.Add(name, matchingPath);
            }
        }

        return allFiles;
    }

    /// <summary>
    /// Uses the sample names in ExperimentAnnotations to locate each psm.tsv file in the results folder:
    /// every sub-folder whose name equals a sample name must contain a psm.tsv file,
    /// whose path is added to allPsmFilePaths.
    /// </summary>
    /// <exception cref="FileNotFoundException"> a sample folder does not contain a psm.tsv file </exception>
    private void FindAllFilePaths()
    {
        if (ExperimentAnnotations is null)
        {
            LoadExperimentAnnotationResults();
        }

        allPsmFilePaths = new List<string>();

        // A set gives the same ordinal, case-sensitive comparison as string.Equals without a nested loop
        var sampleNames = new HashSet<string>(ExperimentAnnotations.Select(annotation => annotation.SampleName));

        foreach (var directoryEntry in Directory.GetDirectories(FullFolderPath))
        {
            string directoryName = Path.GetFileName(directoryEntry.TrimEnd(Path.DirectorySeparatorChar));
            if (!sampleNames.Contains(directoryName))
            {
                continue;
            }

            string psmFile = Path.Combine(directoryEntry, "psm.tsv");
            if (!File.Exists(psmFile))
            {
                // The message is unchanged; the path is attached so callers can see which sample is incomplete
                throw new FileNotFoundException("This psm.tsv file was not found", psmFile);
            }

            allPsmFilePaths.Add(psmFile);
        }
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ public override void WriteResults(string outputPath)
/// </summary>
/// <param name="fullFilePath"> list of all full file paths associated with a given result </param>
/// <returns> dictionary with key fileName and value fullFilePath </returns>
public Dictionary<string, string> FileNametoFilePath (List<string> fullFilePath)
public Dictionary<string, string> FileNameToFilePath (List<string> fullFilePath)
{
List<string> rawFileNames = Results.Select(psm => psm.FileName).Distinct().ToList();
fullFilePath = fullFilePath.Distinct().ToList();
Expand Down
6 changes: 5 additions & 1 deletion mzLib/Readers/Util/SupportedFileTypes.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public enum SupportedFileType
MsPathFinderTTargets,
MsPathFinderTDecoys,
MsPathFinderTAllResults,
CruxResult
CruxResult,
ExperimentAnnotation
}

public static class SupportedFileTypeExtensions
Expand Down Expand Up @@ -64,6 +65,7 @@ public static string GetFileExtension(this SupportedFileType type)
SupportedFileType.MsPathFinderTDecoys => "_IcDecoy.tsv",
SupportedFileType.MsPathFinderTAllResults => "_IcTDA.tsv",
SupportedFileType.CruxResult => ".txt",
SupportedFileType.ExperimentAnnotation => "experiment_annotation.tsv",
_ => throw new MzLibException("File type not supported")
};
}
Expand Down Expand Up @@ -116,6 +118,8 @@ public static SupportedFileType ParseFileType(this string filePath)
return SupportedFileType.MsPathFinderTDecoys;
if (filePath.EndsWith(SupportedFileType.MsPathFinderTAllResults.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.MsPathFinderTAllResults;
if(filePath.EndsWith(SupportedFileType.ExperimentAnnotation.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase))
return SupportedFileType.ExperimentAnnotation;

// these tsv cases are just .tsv and need an extra step to determine the type
// currently need to distinguish between FlashDeconvTsv and MsFraggerPsm
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00906.00906.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VKEDPDGEHAR SISGRPIK.VKEDPDGEHAR.RAMQKVMA K R 11 3 2111.248 1251.5845 1251.5914 418.2021 418.2044 1251.5842 418.202 0.0072 0.05469976 15.518 11.386 0.8908 2 1 144 154 208463.97 0 FALSE sp|P52272|HNRPM_HUMAN P52272 HNRPM_HUMAN HNRNPM Heterogeneous nuclear ribonucleoprotein M "tr|M0QYQ7|M0QYQ7_HUMAN, tr|M0R019|M0R019_HUMAN, tr|M0R0N3|M0R0N3_HUMAN, tr|M0R2T0|M0R2T0_HUMAN"
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00917.00917.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml NEEDEGHSNSSPR GAKIDASK.NEEDEGHSNSSPR.HSEAATAQ K H 13 3 2113.596 1456.5808 1456.5822 486.5342 486.5347 1456.5814 486.5344 0.0007 0.007893147 17.911 0 1 2 0 73 85 349264.44 0 FALSE sp|Q14103|HNRPD_HUMAN Q14103 HNRPD_HUMAN HNRNPD Heterogeneous nuclear ribonucleoprotein D0 "tr|A0A994J4B1|A0A994J4B1_HUMAN, tr|A0A994J4R1|A0A994J4R1_HUMAN, tr|D6RAF8|D6RAF8_HUMAN, tr|D6RD83|D6RD83_HUMAN"
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00947.00947.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VGQADDSTKPTNK IGSFSGIR.VGQADDSTKPTNK.ASSTSITS R A 13 3 2120.3625 1359.6602 1359.6622 454.2273 454.228 1359.663 454.2283 -0.0007 0.001409289 12.904 0 0.9994 2 1 1339 1351 171548.62 0 FALSE sp|P35658|NU214_HUMAN P35658 NU214_HUMAN NUP214 Nuclear pore complex protein Nup214 "tr|A0A494C1F2|A0A494C1F2_HUMAN, tr|A0A8Q3SHZ4|A0A8Q3SHZ4_HUMAN, tr|B7ZAV2|B7ZAV2_HUMAN, tr|E9PKD2|E9PKD2_HUMAN, tr|H0Y837|H0Y837_HUMAN"
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.01021.01021.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml AEQEAEEPRK IAERARIK.AEQEAEEPRK.THSEEFTN K T 10 3 2136.586 1185.5615 1185.5641 396.1944 396.1953 1185.5625 396.1948 0.0015 0.151182 10.782 0 0.9548 2 1 106 115 125972.164 0 FALSE sp|Q9H788|SH24A_HUMAN Q9H788 SH24A_HUMAN SH2D4A SH2 domain-containing protein 4A tr|H0YAT1|H0YAT1_HUMAN
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01005.01005.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml HAVSEGTK IIPGEIAK.HAVSEGTK.AVTKYTSA K A 8 2 1938.0153 827.4154 827.4152 414.715 414.7149 827.4137 414.7141 0.0015 1.57772E-05 21.993 14.41 0.9994 2 0 110 117 8907404 0 FALSE sp|O60814|H2B1K_HUMAN O60814 H2B1K_HUMAN H2BC12 Histone H2B type 1-K "H2BC1, H2BC11, H2BC12L, H2BC13, H2BC14, H2BC15, H2BC17, H2BC18, H2BC21, H2BC26, H2BC3, H2BC4, H2BC5, H2BC9, H2BK1" "sp|A0A2R8Y619|H2BK1_HUMAN, sp|P06899|H2B1J_HUMAN, sp|P23527|H2B1O_HUMAN, sp|P33778|H2B1B_HUMAN, sp|P57053|H2BFS_HUMAN, sp|P58876|H2B1D_HUMAN, sp|P62807|H2B1C_HUMAN, sp|Q16778|H2B2E_HUMAN, sp|Q5QNW6|H2B2F_HUMAN, sp|Q8N257|H2B3B_HUMAN, sp|Q93079|H2B1H_HUMAN, sp|Q96A08|H2B1A_HUMAN, sp|Q99877|H2B1N_HUMAN, sp|Q99879|H2B1M_HUMAN, sp|Q99880|H2B1L_HUMAN, tr|U3KQK0|U3KQK0_HUMAN"
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01551.01551.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml YDSTHGR DYAAYMFK.YDSTHGR.YAGEVSHD K Y 7 2 1976.3535 834.3639 834.3626 418.1892 418.1886 834.362 418.1883 0.0005 0.01685582 15.593 0 0.9997 2 0 47 53 1.96E+07 0 FALSE sp|P00359|G3P3_YEAST P00359 G3P3_YEAST TDH3 Glyceraldehyde-3-phosphate dehydrogenase 3 "GAPDHS, TDH1, TDH2" "sp|O14556|G3PT_HUMAN, sp|P00358|G3P2_YEAST, sp|P00360|G3P1_YEAST, tr|K7EP73|K7EP73_HUMAN"
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01565.01565.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml AESSQTCHSEQGDK AESSQTCHSEQGDK KSTQNSFR.AESSQTCHSEQGDK.KMEEKNSG R K 14 3 1977.164 1562.6292 1562.6265 521.8837 521.8828 1562.6267 521.8828 -0.0002 8.61986E-05 27.758 10.626 1 2 0 600 613 4579535.5 7C(57.0215) 0 TRUE sp|P46063|RECQ1_HUMAN P46063 RECQ1_HUMAN RECQL ATP-dependent DNA helicase Q1
Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01607.01607.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml VTSTGRPGHASR ERSPWWVR.VTSTGRPGHASR.FMEDTAAE R F 12 3 1979.2815 1224.6323 1224.6318 409.218 409.2179 1224.6322 409.218 -0.0003 0.9328038 10.648 0 0.9687 2 1 198 209 1587950.4 0 FALSE sp|Q03154|ACY1_HUMAN Q03154 ACY1_HUMAN ACY1 Aminoacylase-1 ABHD14A-ACY1 "tr|A0A1B0GU86|A0A1B0GU86_HUMAN, tr|A0A1B0GV31|A0A1B0GV31_HUMAN, tr|A0A1B0GVA5|A0A1B0GVA5_HUMAN, tr|A0A1B0GW23|A0A1B0GW23_HUMAN, tr|C9JMV9|C9JMV9_HUMAN, tr|C9JYZ0|C9JYZ0_HUMAN"
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
file sample sample_name condition replicate
E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.raw A_1 A_1 A 1
E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.raw A_2 A_2 A 2
Loading

0 comments on commit cb08d67

Please sign in to comment.