diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 786ff7d..b7e70e8 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -19,7 +19,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: "3.10" - name: Install dependencies run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e02148f..ae65341 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,25 +13,30 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: "3.10" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install ruff + - name: Lint with Ruff + uses: astral-sh/ruff-action@v3 + with: + args: check --exclude docs,tests - - name: Check with Ruff - run: ruff check --output-format=github . + - name: Check formatting with Ruff + uses: astral-sh/ruff-action@v3 + with: + args: format --check --diff --exclude docs,tests - name: Install package and its dependencies run: pip install --editable .[dev,idxml] + - name: Static type checking with mypy + run: mypy --install-types --non-interactive + - name: Test with pytest and codecov run: | - pytest --cov=psm_utils --cov-report=xml tests/ + pytest --cov=psm_utils --cov-report=xml tests/ - name: Upload coverage reports to Codecov uses: codecov/codecov-action@v3 @@ -46,7 +51,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 diff --git a/online/Home.py b/online/Home.py index 1c1f8fc..f6eb7f3 100644 --- a/online/Home.py +++ b/online/Home.py @@ -4,6 +4,8 @@ class StreamlitPageHome(StreamlitPage): + """Streamlit page for the home section.""" + def _main_page(self): pass diff --git a/online/_utils.py b/online/_utils.py index 
a69b26e..b743694 100644 --- a/online/_utils.py +++ b/online/_utils.py @@ -11,6 +11,7 @@ class ECDF: ---------- x : array_like Observations + """ def __init__(self, x): @@ -101,15 +102,15 @@ def pp_plot(psm_df): def fdr_plot(psm_df, fdr_threshold): """Plot number of identifications in function of FDR threshold.""" - df = ( + target_psm_df = ( psm_df[~psm_df["is_decoy"]] .reset_index(drop=True) .sort_values("qvalue", ascending=True) .copy() ) - df["count"] = (~df["is_decoy"]).cumsum() + target_psm_df["count"] = (~target_psm_df["is_decoy"]).cumsum() fig = px.line( - df, + target_psm_df, x="qvalue", y="count", log_x=True, diff --git a/psm_utils/__main__.py b/psm_utils/__main__.py index db273b6..c74bbe2 100644 --- a/psm_utils/__main__.py +++ b/psm_utils/__main__.py @@ -32,6 +32,7 @@ def main(): + """Run the main entry point for the psm_utils CLI.""" logging.basicConfig( level="NOTSET", format="%(message)s", @@ -47,6 +48,7 @@ def main(): @click.group() def cli(): + """Command line interface for psm_utils.""" pass diff --git a/psm_utils/io/__init__.py b/psm_utils/io/__init__.py index ac309e2..295b194 100644 --- a/psm_utils/io/__init__.py +++ b/psm_utils/io/__init__.py @@ -1,10 +1,46 @@ -"""Parsers for proteomics search results from various search engines.""" +""" +Parsers for proteomics search results from various search engines. + +This module provides a unified interface for reading and writing peptide-spectrum match (PSM) +files from various proteomics search engines and analysis tools. It supports automatic file +type detection and conversion between different formats. + +The module includes: + +- Reader and writer classes for various PSM file formats +- Automatic file type inference from filename patterns +- File conversion utilities +- Progress tracking for long operations +- Type-safe interfaces with comprehensive error handling + +Supported file formats include MaxQuant, MS²PIP, Percolator, mzIdentML, pepXML, and many others. 
+See the documentation for a complete list of supported formats. + +Examples +-------- +Read a PSM file with automatic format detection: + +>>> from psm_utils.io import read_file +>>> psm_list = read_file("results.tsv") + +Convert between file formats: + +>>> from psm_utils.io import convert +>>> convert("input.msms", "output.mzid") + +Write a PSMList to file: + +>>> from psm_utils.io import write_file +>>> write_file(psm_list, "output.tsv") + +""" from __future__ import annotations import re from pathlib import Path from tempfile import NamedTemporaryFile +from typing import Protocol, TypedDict, runtime_checkable from rich.progress import track @@ -26,12 +62,22 @@ import psm_utils.io.sage as sage import psm_utils.io.tsv as tsv import psm_utils.io.xtandem as xtandem -from psm_utils.io._base_classes import WriterBase +from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -FILETYPES = { + +class FileType(TypedDict): + """Type definition for filetype properties.""" + + reader: type[ReaderBase] | None + writer: type[WriterBase] | None + extension: str + filename_pattern: str + + +FILETYPES: dict[str, FileType] = { "flashlfq": { "reader": flashlfq.FlashLFQReader, "writer": flashlfq.FlashLFQWriter, @@ -150,12 +196,24 @@ FILETYPES["sage"] = FILETYPES["sage_tsv"] # Alias for backwards compatibility -READERS = {k: v["reader"] for k, v in FILETYPES.items() if v["reader"]} -WRITERS = {k: v["writer"] for k, v in FILETYPES.items() if v["writer"]} +# Type-annotated lookup dictionaries for readers and writers +READERS: dict[str, type[ReaderBase]] = { + k: v["reader"] for k, v in FILETYPES.items() if v["reader"] +} +WRITERS: dict[str, type[WriterBase]] = { + k: v["writer"] for k, v in FILETYPES.items() if v["writer"] +} + + +@runtime_checkable +class _SupportsStr(Protocol): + """Protocol to check if an object supports string conversion.""" + def 
__str__(self) -> str: ... -def _infer_filetype(filename: str): - """Infer filetype from filename.""" + +def _infer_filetype(filename: _SupportsStr) -> str: + """Infer filetype from filename using pattern matching.""" for filetype, properties in FILETYPES.items(): if re.fullmatch(properties["filename_pattern"], str(filename), flags=re.IGNORECASE): return filetype @@ -163,7 +221,7 @@ def _infer_filetype(filename: str): raise PSMUtilsIOException("Could not infer filetype.") -def _supports_write_psm(writer: WriterBase): +def _supports_write_psm(writer: type[WriterBase]) -> bool: """Check if writer supports write_psm method.""" with NamedTemporaryFile(delete=False) as temp_file: temp_file.close() @@ -182,21 +240,32 @@ def _supports_write_psm(writer: WriterBase): return supports_write_psm -def read_file(filename: str | Path, *args, filetype: str = "infer", **kwargs): +def read_file(filename: str | Path, *args, filetype: str = "infer", **kwargs) -> PSMList: """ Read PSM file into :py:class:`~psm_utils.psmlist.PSMList`. Parameters ---------- - filename: str - Path to file. - filetype: str, optional - File type. Any PSM file type with read support. See psm_utils tag in - :ref:`Supported file formats`. - *args : tuple - Additional arguments are passed to the :py:class:`psm_utils.io` reader. - **kwargs : dict, optional - Additional keyword arguments are passed to the :py:class:`psm_utils.io` reader. + filename + Path to the PSM file to read. + filetype + File type specification. Can be any PSM file type with read support or "infer" to + automatically detect from filename pattern. See documentation for supported file formats. + *args + Additional positional arguments passed to the PSM file reader. + **kwargs + Additional keyword arguments passed to the PSM file reader. + + Returns + ------- + List of PSM objects parsed from the input file. 
+ + Raises + ------ + PSMUtilsIOException + If filetype cannot be inferred or if the specified filetype is + unknown or not supported for reading. + """ if filetype == "infer": filetype = _infer_filetype(filename) @@ -218,25 +287,34 @@ def write_file( filetype: str = "infer", show_progressbar: bool = False, **kwargs, -): +) -> None: """ Write :py:class:`~psm_utils.psmlist.PSMList` to PSM file. Parameters ---------- - psm_list: PSMList - PSM list to be written. - filename: str - Path to file. - filetype: str, optional - File type. Any PSM file type with read support. See psm_utils tag in - :ref:`Supported file formats`. - show_progressbar: bool, optional - Show progress bar for conversion process. (default: False) - *args : tuple - Additional arguments are passed to the :py:class:`psm_utils.io` writer. - **kwargs : dict, optional - Additional keyword arguments are passed to the :py:class:`psm_utils.io` writer. + psm_list + List of PSM objects to be written to file. + filename + Path to the output file. + filetype + File type specification. Can be any PSM file type with write support or "infer" to + automatically detect from filename pattern. See documentation for supported file formats. + show_progressbar + Whether to display a progress bar during the writing process. + *args + Additional positional arguments passed to the PSM file writer. + **kwargs + Additional keyword arguments passed to the PSM file writer. + + Raises + ------ + PSMUtilsIOException + If filetype cannot be inferred or if the specified filetype is + unknown or not supported for writing. + IndexError + If psm_list is empty and cannot provide an example PSM. + """ if filetype == "infer": filetype = _infer_filetype(filename) @@ -270,29 +348,37 @@ def convert( input_filetype: str = "infer", output_filetype: str = "infer", show_progressbar: bool = False, -): +) -> None: """ Convert a PSM file from one format into another. Parameters ---------- - input_filename: str - Path to input file. 
- output_filename: str - Path to output file. - input_filetype: str, optional - File type. Any PSM file type with read support. See psm_utils tag in - :ref:`Supported file formats`. - output_filetype: str, optional - File type. Any PSM file type with write support. See psm_utils tag in - :ref:`Supported file formats`. - show_progressbar: bool, optional - Show progress bar for conversion process. (default: False) - + input_filename + Path to the input PSM file. + output_filename + Path to the output PSM file. + input_filetype + Input file type specification. Can be any PSM file type with read support + or "infer" to automatically detect from filename pattern. + See documentation for supported file formats. + output_filetype + Output file type specification. Can be any PSM file type with write support + or "infer" to automatically detect from filename pattern. + See documentation for supported file formats. + show_progressbar + Whether to display a progress bar during the conversion process. + + Raises + ------ + PSMUtilsIOException + If input or output filetypes cannot be inferred, if the specified filetypes are + unknown or not supported, or if the input file is empty. + (An unknown filetype key in READERS or WRITERS is re-raised as + PSMUtilsIOException, chained from the underlying KeyError.) Examples -------- - Convert a MaxQuant msms.txt file to a MS²PIP peprec file, while inferring the applicable file types from the file extensions: @@ -309,19 +395,23 @@ def convert( ... output_filetype="peprec" ... ) - Note that filetypes can only be inferred for select specific file names and/or - extensions, such as ``msms.txt`` or ``*.peprec``. + Notes + ----- + Filetypes can only be inferred for select specific file names and/or extensions, such as + ``msms.txt`` or ``*.peprec``. 
""" - # If needed, infer input and output filetypes if input_filetype == "infer": input_filetype = _infer_filetype(input_filename) if output_filetype == "infer": output_filetype = _infer_filetype(output_filename) - reader_cls = READERS[input_filetype] - writer_cls = WRITERS[output_filetype] + try: + reader_cls = READERS[input_filetype] + writer_cls = WRITERS[output_filetype] + except KeyError as e: + raise PSMUtilsIOException(f"Filetype '{e.args[0]}' unknown or not supported.") from e # Remove file if already exists to avoid appending: if Path(output_filename).is_file(): @@ -330,15 +420,20 @@ def convert( reader = reader_cls(input_filename) if _supports_write_psm(writer_cls): - # Setup iterator, potentially with progress bar - iterator = ( - track(reader, description="[green]Converting file") if show_progressbar else reader - ) + # Setup iterator, potentially with indeterminate progress bar + if show_progressbar: + # Use indeterminate progress tracking for lazy evaluation + iterator = track(reader, description="[green]Converting file") + else: + iterator = reader # Get example PSM and instantiate writer for psm in reader: example_psm = psm break + else: + raise PSMUtilsIOException("Input file is empty or does not contain valid PSMs.") + writer = writer_cls(output_filename, example_psm=example_psm, mode="write") # Convert diff --git a/psm_utils/io/_base_classes.py b/psm_utils/io/_base_classes.py index 60f9c19..d0646b0 100644 --- a/psm_utils/io/_base_classes.py +++ b/psm_utils/io/_base_classes.py @@ -3,6 +3,7 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections.abc import Iterator from pathlib import Path from psm_utils.psm import PSM @@ -12,6 +13,8 @@ class ReaderBase(ABC): """Abstract base class for PSM file readers.""" + filename: Path + def __init__( self, filename: str | Path, @@ -19,26 +22,32 @@ def __init__( **kwargs, ) -> None: """ - Reader for PSM file. + Initialize PSM file reader. 
Parameters ---------- - filename: str, pathlib.Path + filename : str or pathlib.Path Path to PSM file. + *args + Additional positional arguments for subclasses. + **kwargs + Additional keyword arguments for subclasses. """ super().__init__() - self.filename = Path(filename) - def __enter__(self): + def __enter__(self) -> ReaderBase: + """Enter context manager.""" return self - def __exit__(self, *args, **kwargs): + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Exit context manager.""" pass @abstractmethod - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: + """Iterate over the PSM file and return PSMs one-by-one.""" raise NotImplementedError() def read_file(self) -> PSMList: @@ -49,22 +58,39 @@ def read_file(self) -> PSMList: class WriterBase(ABC): """Abstract base class for PSM file writers.""" - def __init__(self, filename, *args, **kwargs): + filename: Path + + def __init__(self, filename: str | Path, *args, **kwargs) -> None: + """ + Initialize PSM file writer. + + Parameters + ---------- + filename : str or pathlib.Path + Path to output PSM file. + *args + Additional positional arguments for subclasses. + **kwargs + Additional keyword arguments for subclasses. 
+ + """ super().__init__() self.filename = Path(filename) - def __enter__(self): + def __enter__(self) -> WriterBase: + """Enter context manager.""" return self - def __exit__(self, *args, **kwargs): + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Exit context manager.""" pass @abstractmethod - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """Write a single PSM to the PSM file.""" raise NotImplementedError() @abstractmethod - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """Write an entire PSMList to the PSM file.""" raise NotImplementedError() diff --git a/psm_utils/io/_pd_msf_tables.py b/psm_utils/io/_pd_msf_tables.py index a2323bd..fbbbdd4 100644 --- a/psm_utils/io/_pd_msf_tables.py +++ b/psm_utils/io/_pd_msf_tables.py @@ -1,242 +1,270 @@ -"""SQLAlchemy models for Mascot MSF files.""" +""" +SQLAlchemy ORM models for Proteome Discoverer MSF database files. + +This module provides SQLAlchemy table definitions for interfacing with Proteome Discoverer MSF +(Mascot Search Form) database files. MSF files contain proteomics search results including +peptide identifications, protein annotations, spectra metadata, and quantification data. + +The table definitions are auto-generated from MSF schema and follow SQLAlchemy 2.0 patterns +with proper typing support. + +Examples +-------- +>>> from psm_utils.io._pd_msf_tables import Base, Peptide +>>> # Use with SQLAlchemy session to query MSF database +>>> session.query(Peptide).filter(Peptide.ConfidenceLevel > 2).all() + +Notes +----- +These models are primarily used internally by the proteome_discoverer module for reading PSM +data from MSF files. 
+ +""" + +from __future__ import annotations + +from datetime import datetime from sqlalchemy import ( CHAR, BigInteger, Boolean, - Column, DateTime, Float, Index, Integer, LargeBinary, + MetaData, SmallInteger, String, - Table, Text, UniqueConstraint, text, ) +from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column + + +class Base(DeclarativeBase): + """Base class for all MSF table models.""" -try: - from sqlalchemy.orm import declarative_base -except ImportError: - from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.sql.sqltypes import NullType + pass -Base = declarative_base() -metadata = Base.metadata + +# Module-level metadata reference for table definitions +metadata: MetaData = Base.metadata class AminoAcidModification(Base): __tablename__ = "AminoAcidModifications" - AminoAcidModificationID = Column(Integer, primary_key=True) - ModificationName = Column(String, nullable=False) - DeltaMass = Column(Float) - Substitution = Column(String) - LeavingGroup = Column(String) - Abbreviation = Column(String, nullable=False) - PositionType = Column(Integer, nullable=False) - IsActive = Column(Boolean) - DeltaAverageMass = Column(Float) - UnimodAccession = Column(String) - IsSubstitution = Column(Boolean, nullable=False, server_default=text("0")) + AminoAcidModificationID: Mapped[int] = mapped_column(Integer, primary_key=True) + ModificationName: Mapped[str] = mapped_column(String, nullable=False) + DeltaMass: Mapped[float | None] = mapped_column(Float) + Substitution: Mapped[str | None] = mapped_column(String) + LeavingGroup: Mapped[str | None] = mapped_column(String) + Abbreviation: Mapped[str] = mapped_column(String, nullable=False) + PositionType: Mapped[int] = mapped_column(Integer, nullable=False) + IsActive: Mapped[bool | None] = mapped_column(Boolean) + DeltaAverageMass: Mapped[float | None] = mapped_column(Float) + UnimodAccession: Mapped[str | None] = mapped_column(String) + IsSubstitution: Mapped[bool] = mapped_column(Boolean, 
nullable=False, server_default=text("0")) class AminoAcidModificationsAminoAcid(Base): __tablename__ = "AminoAcidModificationsAminoAcids" - AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) - AminoAcidID = Column(Integer, primary_key=True, nullable=False) - Classification = Column(Integer, nullable=False) + AminoAcidModificationID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + AminoAcidID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Classification: Mapped[int] = mapped_column(Integer, nullable=False) class AminoAcidModificationsAminoAcidsNL(Base): __tablename__ = "AminoAcidModificationsAminoAcidsNL" - AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) - AminoAcidID = Column(Integer, primary_key=True, nullable=False) - NeutralLossID = Column(Integer, primary_key=True, nullable=False) + AminoAcidModificationID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + AminoAcidID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + NeutralLossID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) class AminoAcidModificationsNeutralLoss(Base): __tablename__ = "AminoAcidModificationsNeutralLosses" - NeutralLossID = Column(Integer, primary_key=True) - Name = Column(String, nullable=False) - MonoisotopicMass = Column(Float, nullable=False) - AverageMass = Column(Float, nullable=False) + NeutralLossID: Mapped[int] = mapped_column(Integer, primary_key=True) + Name: Mapped[str] = mapped_column(String, nullable=False) + MonoisotopicMass: Mapped[float] = mapped_column(Float, nullable=False) + AverageMass: Mapped[float] = mapped_column(Float, nullable=False) class AminoAcid(Base): __tablename__ = "AminoAcids" - AminoAcidID = Column(Integer, primary_key=True) - AminoAcidName = Column(String, nullable=False) - OneLetterCode = Column(CHAR) - ThreeLetterCode = Column(CHAR) - MonoisotopicMass = Column(Float, 
nullable=False) - AverageMass = Column(Float, nullable=False) - SumFormula = Column(String) + AminoAcidID: Mapped[int] = mapped_column(Integer, primary_key=True) + AminoAcidName: Mapped[str] = mapped_column(String, nullable=False) + OneLetterCode: Mapped[str | None] = mapped_column(CHAR) + ThreeLetterCode: Mapped[str | None] = mapped_column(CHAR) + MonoisotopicMass: Mapped[float] = mapped_column(Float, nullable=False) + AverageMass: Mapped[float] = mapped_column(Float, nullable=False) + SumFormula: Mapped[str | None] = mapped_column(String) class AnnotationDataVersion(Base): __tablename__ = "AnnotationDataVersion" - PcDataVersion = Column(Integer, primary_key=True) - PcDataRelease = Column(BigInteger, nullable=False) + PcDataVersion: Mapped[int] = mapped_column(Integer, primary_key=True) + PcDataRelease: Mapped[int] = mapped_column(BigInteger, nullable=False) class AnnotationDataset(Base): __tablename__ = "AnnotationDataset" - DatasetId = Column(Integer, primary_key=True) - Name = Column(String, nullable=False) - DisplayName = Column(String, nullable=False) - Guid = Column(String, nullable=False) - Description = Column(Text) + DatasetId: Mapped[int] = mapped_column(Integer, primary_key=True) + Name: Mapped[str] = mapped_column(String, nullable=False) + DisplayName: Mapped[str] = mapped_column(String, nullable=False) + Guid: Mapped[str] = mapped_column(String, nullable=False) + Description: Mapped[str | None] = mapped_column(Text) class AnnotationGroup(Base): __tablename__ = "AnnotationGroups" - AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) - Description = Column(Text) - DatasetId = Column(Integer, primary_key=True, nullable=False) - Position = Column(Integer, nullable=False) - ColorR = Column(Integer, nullable=False) - ColorG = Column(Integer, nullable=False) - ColorB = Column(Integer, nullable=False) - GroupDefinition = Column(LargeBinary) + AnnotationGroupId: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + 
Description: Mapped[str | None] = mapped_column(Text) + DatasetId: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Position: Mapped[int] = mapped_column(Integer, nullable=False) + ColorR: Mapped[int] = mapped_column(Integer, nullable=False) + ColorG: Mapped[int] = mapped_column(Integer, nullable=False) + ColorB: Mapped[int] = mapped_column(Integer, nullable=False) + GroupDefinition: Mapped[bytes | None] = mapped_column(LargeBinary) class AnnotationType(Base): __tablename__ = "AnnotationTypes" - AnnotationTypeId = Column(Integer, primary_key=True) - Name = Column(String, nullable=False) - Description = Column(Text) + AnnotationTypeId: Mapped[int] = mapped_column(Integer, primary_key=True) + Name: Mapped[str] = mapped_column(String, nullable=False) + Description: Mapped[str | None] = mapped_column(Text) class Annotation(Base): __tablename__ = "Annotations" - AnnotationId = Column(Integer, primary_key=True) - Accession = Column(String, nullable=False) - Description = Column(Text) - type = Column(Integer) + AnnotationId: Mapped[int] = mapped_column(Integer, primary_key=True) + Accession: Mapped[str] = mapped_column(String, nullable=False) + Description: Mapped[str | None] = mapped_column(Text) + type: Mapped[int | None] = mapped_column(Integer) class AnnotationsAnnotationGroup(Base): __tablename__ = "AnnotationsAnnotationGroups" - AnnotationId = Column(Integer, primary_key=True, nullable=False) - AnnotationGroupId = Column(Integer, primary_key=True, nullable=False) + AnnotationId: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + AnnotationGroupId: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) class AnnotationsProtein(Base): __tablename__ = "AnnotationsProtein" - proteinID = Column(Integer, primary_key=True, nullable=False) - AnnotationId = Column(Integer, primary_key=True, nullable=False) - Evidence = Column(Integer, primary_key=True) - PositionBegin = Column(Integer, primary_key=True) - 
PositionEnd = Column(Integer) - ProteinAccession = Column(String, primary_key=True, nullable=False) + proteinID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + AnnotationId: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Evidence: Mapped[int | None] = mapped_column(Integer, primary_key=True) + PositionBegin: Mapped[int | None] = mapped_column(Integer, primary_key=True) + PositionEnd: Mapped[int | None] = mapped_column(Integer) + ProteinAccession: Mapped[str] = mapped_column(String, primary_key=True, nullable=False) class Chromatogram(Base): __tablename__ = "Chromatograms" - FileID = Column(Integer, primary_key=True, nullable=False) - TraceType = Column(Integer, primary_key=True, nullable=False) - Chromatogram = Column(String, nullable=False) + FileID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + TraceType: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Chromatogram: Mapped[str] = mapped_column(String, nullable=False) class CustomDataField(Base): __tablename__ = "CustomDataFields" - FieldID = Column(Integer, primary_key=True) - Guid = Column(String, nullable=False) - DisplayName = Column(String, nullable=False) - SourceNodeNumber = Column(Integer, nullable=False) - TargetNodeNumber = Column(Integer, nullable=False) - DataType = Column(Integer, nullable=False) - DataTarget = Column(Integer, nullable=False) - Version = Column(Float, nullable=False) - AccessMode = Column(Integer, server_default=text("0")) - Visibility = Column(Integer, server_default=text("0")) - GroupVisibility = Column(Integer, server_default=text("0")) - Format = Column(String) - PlotType = Column(Integer, nullable=False) - DataPurpose = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True) + Guid: Mapped[str] = mapped_column(String, nullable=False) + DisplayName: Mapped[str] = mapped_column(String, nullable=False) + SourceNodeNumber: Mapped[int] = 
mapped_column(Integer, nullable=False) + TargetNodeNumber: Mapped[int] = mapped_column(Integer, nullable=False) + DataType: Mapped[int] = mapped_column(Integer, nullable=False) + DataTarget: Mapped[int] = mapped_column(Integer, nullable=False) + Version: Mapped[float] = mapped_column(Float, nullable=False) + AccessMode: Mapped[int | None] = mapped_column(Integer, server_default=text("0")) + Visibility: Mapped[int | None] = mapped_column(Integer, server_default=text("0")) + GroupVisibility: Mapped[int | None] = mapped_column(Integer, server_default=text("0")) + Format: Mapped[str | None] = mapped_column(String) + PlotType: Mapped[int] = mapped_column(Integer, nullable=False) + DataPurpose: Mapped[str | None] = mapped_column(String) class CustomDataPeptide(Base): __tablename__ = "CustomDataPeptides" - FieldID = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + FieldValue: Mapped[str | None] = mapped_column(String) class CustomDataPeptidesDecoy(Base): __tablename__ = "CustomDataPeptides_decoy" - FieldID = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + FieldValue: Mapped[str | None] = mapped_column(String) class CustomDataProcessingNode(Base): __tablename__ = "CustomDataProcessingNodes" - FieldID = Column(Integer, primary_key=True, nullable=False) - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = 
mapped_column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column( + Integer, primary_key=True, nullable=False, index=True + ) + FieldValue: Mapped[str | None] = mapped_column(String) class CustomDataProtein(Base): __tablename__ = "CustomDataProteins" - FieldID = Column(Integer, primary_key=True, nullable=False) - ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + FieldValue: Mapped[str | None] = mapped_column(String) class CustomDataProteinsDecoy(Base): __tablename__ = "CustomDataProteins_decoy" - FieldID = Column(Integer, primary_key=True, nullable=False) - ProteinID = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + FieldValue: Mapped[str | None] = mapped_column(String) class CustomDataSpectra(Base): __tablename__ = "CustomDataSpectra" - FieldID = Column(Integer, primary_key=True, nullable=False) - SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) - FieldValue = Column(String) + FieldID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + SpectrumID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + FieldValue: Mapped[str | None] = mapped_column(String) class Enzyme(Base): __tablename__ = "Enzymes" - EnzymeID = Column(Integer, primary_key=True) - Name = Column(String, nullable=False) - Abbreviation = Column(String, nullable=False) - Seperator = Column(String, nullable=False) - NonSeperator = Column(String, nullable=False) - Offset = Column(Integer, nullable=False) + EnzymeID: Mapped[int] = 
mapped_column(Integer, primary_key=True) + Name: Mapped[str] = mapped_column(String, nullable=False) + Abbreviation: Mapped[str] = mapped_column(String, nullable=False) + Seperator: Mapped[str] = mapped_column(String, nullable=False) + NonSeperator: Mapped[str] = mapped_column(String, nullable=False) + Offset: Mapped[int] = mapped_column(Integer, nullable=False) class EnzymesCleavageSpecificity(Base): __tablename__ = "EnzymesCleavageSpecificities" - EnzymeID = Column(Integer, primary_key=True, nullable=False) - Specificity = Column(Integer, primary_key=True, nullable=False) + EnzymeID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Specificity: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) class EventAnnotation(Base): @@ -248,20 +276,20 @@ class EventAnnotation(Base): Index("IX_EventAnnotations_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID"), ) - EventID = Column(Integer, primary_key=True) - Charge = Column(SmallInteger, nullable=False) - IsotopePatternID = Column(Integer, nullable=False) - QuanResultID = Column(Integer, nullable=False) - QuanChannelID = Column(Integer, nullable=False) + EventID: Mapped[int] = mapped_column(Integer, primary_key=True) + Charge: Mapped[int] = mapped_column(SmallInteger, nullable=False) + IsotopePatternID: Mapped[int] = mapped_column(Integer, nullable=False) + QuanResultID: Mapped[int] = mapped_column(Integer, nullable=False) + QuanChannelID: Mapped[int] = mapped_column(Integer, nullable=False) class EventAreaAnnotation(Base): __tablename__ = "EventAreaAnnotations" - EventID = Column(Integer, primary_key=True) - Charge = Column(SmallInteger, nullable=False) - IsotopePatternID = Column(Integer, nullable=False, index=True) - QuanResultID = Column(Integer, nullable=False) + EventID: Mapped[int] = mapped_column(Integer, primary_key=True) + Charge: Mapped[int] = mapped_column(SmallInteger, nullable=False) + IsotopePatternID: Mapped[int] = mapped_column(Integer, 
nullable=False, index=True) + QuanResultID: Mapped[int] = mapped_column(Integer, nullable=False) class Event(Base): @@ -271,316 +299,325 @@ class Event(Base): Index("IX_Events_FileID_RT", "FileID", "RT"), ) - EventID = Column(Integer, primary_key=True) - Mass = Column(Float, nullable=False) - MassAvg = Column(Float, nullable=False) - Area = Column(Float, nullable=False) - Intensity = Column(Float, nullable=False) - PeakWidth = Column(Float, nullable=False) - RT = Column(Float, nullable=False) - LeftRT = Column(Float, nullable=False) - RightRT = Column(Float, nullable=False) - SN = Column(Float, nullable=False, server_default=text("0.0")) - FileID = Column(Integer, nullable=False) + EventID: Mapped[int] = mapped_column(Integer, primary_key=True) + Mass: Mapped[float] = mapped_column(Float, nullable=False) + MassAvg: Mapped[float] = mapped_column(Float, nullable=False) + Area: Mapped[float] = mapped_column(Float, nullable=False) + Intensity: Mapped[float] = mapped_column(Float, nullable=False) + PeakWidth: Mapped[float] = mapped_column(Float, nullable=False) + RT: Mapped[float] = mapped_column(Float, nullable=False) + LeftRT: Mapped[float] = mapped_column(Float, nullable=False) + RightRT: Mapped[float] = mapped_column(Float, nullable=False) + SN: Mapped[float] = mapped_column(Float, nullable=False, server_default=text("0.0")) + FileID: Mapped[int] = mapped_column(Integer, nullable=False) class FastaFile(Base): __tablename__ = "FastaFiles" - FastaFileID = Column(Integer, primary_key=True) - FileName = Column(String, nullable=False) - State = Column(Integer, nullable=False) - VirtualFileName = Column(String, nullable=False) - FileSize = Column(BigInteger, nullable=False) - FileTime = Column(BigInteger, nullable=False) - NumberOfProteins = Column(BigInteger) - NumberOfAminoAcids = Column(BigInteger) - FileHashCode = Column(BigInteger) - Hidden = Column(Boolean, nullable=False) - IsSrfImport = Column(Boolean, nullable=False) - IsScheduledForDeletion = Column(Boolean, 
nullable=False, server_default=text("0")) + FastaFileID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + FileName: Mapped[str] = mapped_column(String, nullable=False) + State: Mapped[int] = mapped_column(Integer, nullable=False) + VirtualFileName: Mapped[str] = mapped_column(String, nullable=False) + FileSize: Mapped[int] = mapped_column(BigInteger, nullable=False) + FileTime: Mapped[int] = mapped_column(BigInteger, nullable=False) + NumberOfProteins: Mapped[int | None] = mapped_column(BigInteger) + NumberOfAminoAcids: Mapped[int | None] = mapped_column(BigInteger) + FileHashCode: Mapped[int | None] = mapped_column(BigInteger) + Hidden: Mapped[bool] = mapped_column(Boolean, nullable=False) + IsSrfImport: Mapped[bool] = mapped_column(Boolean, nullable=False) + IsScheduledForDeletion: Mapped[bool] = mapped_column( + Boolean, nullable=False, server_default=text("0") + ) class FastaFilesProteinAnnotation(Base): __tablename__ = "FastaFilesProteinAnnotations" - FastaFileID = Column(Integer, primary_key=True, nullable=False) - ProteinAnnotationID = Column(Integer, primary_key=True, nullable=False, index=True) + FastaFileID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProteinAnnotationID: Mapped[int] = mapped_column( + Integer, primary_key=True, nullable=False, index=True + ) class FileInfo(Base): __tablename__ = "FileInfos" - FileID = Column(Integer, primary_key=True) - FileName = Column(String, nullable=False) - FileTime = Column(String, nullable=False) - FileSize = Column(BigInteger, nullable=False) - PhysicalFileName = Column(String, nullable=False) - FileType = Column(SmallInteger, nullable=False) + FileID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + FileName: Mapped[str] = mapped_column(String, nullable=False) + FileTime: Mapped[str] = mapped_column(String, nullable=False) + FileSize: Mapped[int] = mapped_column(BigInteger, nullable=False) + PhysicalFileName: Mapped[str] = mapped_column(String, 
nullable=False) + FileType: Mapped[int] = mapped_column(SmallInteger, nullable=False) class MassPeakRelation(Base): __tablename__ = "MassPeakRelations" - MassPeakID = Column(Integer, primary_key=True, nullable=False) - RelatedMassPeakID = Column(Integer, primary_key=True, nullable=False) + MassPeakID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + RelatedMassPeakID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) class MassPeak(Base): __tablename__ = "MassPeaks" - MassPeakID = Column(Integer, primary_key=True) - Charge = Column(SmallInteger) - Intensity = Column(Float) - Mass = Column(Float) - ScanNumbers = Column(String) - FileID = Column(Integer) - PercentIsolationInterference = Column(Float) - IonInjectTime = Column(Integer) + MassPeakID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + Charge: Mapped[int | None] = mapped_column(SmallInteger) + Intensity: Mapped[float | None] = mapped_column(Float) + Mass: Mapped[float | None] = mapped_column(Float) + ScanNumbers: Mapped[str | None] = mapped_column(String) + FileID: Mapped[int | None] = mapped_column(Integer) + PercentIsolationInterference: Mapped[float | None] = mapped_column(Float) + IonInjectTime: Mapped[int | None] = mapped_column(Integer) class PeptideScore(Base): __tablename__ = "PeptideScores" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - ScoreID = Column(Integer, primary_key=True, nullable=False) - ProcessingNodeID = Column(Integer) - ScoreValue = Column(Float, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ScoreID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProcessingNodeID: Mapped[int | None] = mapped_column(Integer) + ScoreValue: Mapped[float] = mapped_column(Float, 
nullable=False) class PeptideScoreDecoy(Base): __tablename__ = "PeptideScores_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - ScoreID = Column(Integer, primary_key=True, nullable=False) - ProcessingNodeID = Column(Integer) - ScoreValue = Column(Float, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ScoreID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProcessingNodeID: Mapped[int | None] = mapped_column(Integer) + ScoreValue: Mapped[float] = mapped_column(Float, nullable=False) class Peptide(Base): __tablename__ = "Peptides" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) - SpectrumID = Column(Integer, nullable=False, index=True) - TotalIonsCount = Column(SmallInteger, nullable=False) - MatchedIonsCount = Column(SmallInteger, nullable=False) - ConfidenceLevel = Column(SmallInteger, nullable=False) - SearchEngineRank = Column(Integer, nullable=False) - Hidden = Column(Boolean, nullable=False, server_default=text("0")) - Sequence = Column(String) - Annotation = Column(String) - UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) - MissedCleavages = Column(SmallInteger, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + SpectrumID: Mapped[int] = mapped_column(Integer, nullable=False, index=True) + TotalIonsCount: Mapped[int] = mapped_column(SmallInteger, nullable=False) + MatchedIonsCount: Mapped[int] = mapped_column(SmallInteger, nullable=False) + ConfidenceLevel: Mapped[int] = mapped_column(SmallInteger, 
nullable=False) + SearchEngineRank: Mapped[int] = mapped_column(Integer, nullable=False) + Hidden: Mapped[bool] = mapped_column(Boolean, nullable=False, server_default=text("0")) + Sequence: Mapped[str | None] = mapped_column(String) + Annotation: Mapped[str | None] = mapped_column(String) + UniquePeptideSequenceID: Mapped[int] = mapped_column( + Integer, nullable=False, server_default=text("1") + ) + MissedCleavages: Mapped[int] = mapped_column(SmallInteger, nullable=False) class PeptidesAminoAcidModification(Base): __tablename__ = "PeptidesAminoAcidModifications" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) - Position = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True) + AminoAcidModificationID: Mapped[int] = mapped_column(Integer, primary_key=True) + Position: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptidesAminoAcidModificationsDecoy(Base): __tablename__ = "PeptidesAminoAcidModifications_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - AminoAcidModificationID = Column(Integer, primary_key=True, nullable=False) - Position = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True) + AminoAcidModificationID: Mapped[int] = mapped_column(Integer, primary_key=True) + Position: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptidesProtein(Base): __tablename__ = "PeptidesProteins" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, 
primary_key=True, nullable=False, index=True) - ProteinID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, index=True) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptidesProteinDecoy(Base): __tablename__ = "PeptidesProteins_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) - ProteinID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, index=True) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptidesReferenceSpectra(Base): __tablename__ = "PeptidesReferenceSpectra" - PeptideID = Column(Integer, primary_key=True) - ReferenceSpectrumID = Column(Integer, nullable=False) + PeptideID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ReferenceSpectrumID: Mapped[int] = mapped_column(Integer) class PeptidesTerminalModification(Base): __tablename__ = "PeptidesTerminalModifications" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - TerminalModificationID = Column(Integer, primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True) + TerminalModificationID: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptidesTerminalModificationDecoy(Base): __tablename__ = "PeptidesTerminalModifications_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False) - TerminalModificationID = Column(Integer, 
primary_key=True, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True) + TerminalModificationID: Mapped[int] = mapped_column(Integer, primary_key=True) class PeptideDecoy(Base): __tablename__ = "Peptides_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - PeptideID = Column(Integer, primary_key=True, nullable=False, index=True) - SpectrumID = Column(Integer, nullable=False, index=True) - TotalIonsCount = Column(SmallInteger, nullable=False) - MatchedIonsCount = Column(SmallInteger, nullable=False) - ConfidenceLevel = Column(SmallInteger, nullable=False) - SearchEngineRank = Column(Integer, nullable=False) - Sequence = Column(String) - Annotation = Column(String) - UniquePeptideSequenceID = Column(Integer, nullable=False, server_default=text("1")) - MissedCleavages = Column(SmallInteger, nullable=False) - - -t_PrecursorIonAreaSearchSpectra = Table( - "PrecursorIonAreaSearchSpectra", - metadata, - Column("QuanResultID", Integer, nullable=False, index=True), - Column("SearchSpectrumID", Integer), -) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + PeptideID: Mapped[int] = mapped_column(Integer, primary_key=True, index=True) + SpectrumID: Mapped[int] = mapped_column(Integer, index=True) + TotalIonsCount: Mapped[int] = mapped_column(SmallInteger) + MatchedIonsCount: Mapped[int] = mapped_column(SmallInteger) + ConfidenceLevel: Mapped[int] = mapped_column(SmallInteger) + SearchEngineRank: Mapped[int] = mapped_column(Integer) + Sequence: Mapped[str | None] = mapped_column(String) + Annotation: Mapped[str | None] = mapped_column(String) + UniquePeptideSequenceID: Mapped[int] = mapped_column(Integer, server_default=text("1")) + MissedCleavages: Mapped[int] = mapped_column(SmallInteger) -t_PrecursorIonQuanResults = Table( - "PrecursorIonQuanResults", - metadata, - Column("QuanChannelID", Integer, 
nullable=False), - Column("QuanResultID", Integer, nullable=False), - Column("Mass", Float, nullable=False), - Column("Charge", Integer, nullable=False), - Column("Area", Float), - Column("RetentionTime", Float), - Index( - "IX_PrecursorIonQuanResults_QuanResultID_QuanChannelID", "QuanResultID", "QuanChannelID" - ), -) +class PrecursorIonAreaSearchSpectra(Base): + __tablename__ = "PrecursorIonAreaSearchSpectra" + QuanResultID: Mapped[int] = mapped_column( + Integer, primary_key=True, nullable=False, index=True + ) + SearchSpectrumID: Mapped[int | None] = mapped_column(Integer, primary_key=True) -t_PrecursorIonQuanResultsSearchSpectra = Table( - "PrecursorIonQuanResultsSearchSpectra", - metadata, - Column("ProcessingNodeNumber", Integer, nullable=False), - Column("QuanResultID", Integer, nullable=False, index=True), - Column("SearchSpectrumID", Integer, index=True), -) +class PrecursorIonQuanResult(Base): + __tablename__ = "PrecursorIonQuanResults" + __table_args__ = ( + Index( + "IX_PrecursorIonQuanResults_QuanResultID_QuanChannelID", + "QuanResultID", + "QuanChannelID", + ), + ) -t_ProcessingNodeConnectionPoints = Table( - "ProcessingNodeConnectionPoints", - metadata, - Column("ProcessingNodeID", Integer, nullable=False), - Column("Interface", String, nullable=False), - Column("ConnectionDirection", Integer, nullable=False), - Column("ConnectionMode", Integer, nullable=False), - Column("ConnectionMultiplicity", Integer, nullable=False), - Column("ConnectionRequirement", Integer, nullable=False), - Column("DataTypeSpecialization", String, nullable=False), - Column("ConnectionDisplayName", String, nullable=False), -) + QuanChannelID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + QuanResultID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Mass: Mapped[float] = mapped_column(Float, nullable=False) + Charge: Mapped[int] = mapped_column(Integer, nullable=False) + Area: Mapped[float | None] = mapped_column(Float) + 
RetentionTime: Mapped[float | None] = mapped_column(Float) + + +class PrecursorIonQuanResultsSearchSpectra(Base): + __tablename__ = "PrecursorIonQuanResultsSearchSpectra" + + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + QuanResultID: Mapped[int] = mapped_column( + Integer, primary_key=True, nullable=False, index=True + ) + SearchSpectrumID: Mapped[int | None] = mapped_column(Integer, index=True) + + +class ProcessingNodeConnectionPoint(Base): + __tablename__ = "ProcessingNodeConnectionPoints" + + ProcessingNodeID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Interface: Mapped[str] = mapped_column(String, primary_key=True, nullable=False) + ConnectionDirection: Mapped[int] = mapped_column(Integer, nullable=False) + ConnectionMode: Mapped[int] = mapped_column(Integer, nullable=False) + ConnectionMultiplicity: Mapped[int] = mapped_column(Integer, nullable=False) + ConnectionRequirement: Mapped[int] = mapped_column(Integer, nullable=False) + DataTypeSpecialization: Mapped[str] = mapped_column(String, nullable=False) + ConnectionDisplayName: Mapped[str] = mapped_column(String, nullable=False) class ProcessingNodeExtension(Base): __tablename__ = "ProcessingNodeExtensions" - ExtensionID = Column(Integer, primary_key=True) - ProcessingNodeNumber = Column(Integer, nullable=False) - Guid = Column(String, nullable=False) - Purpose = Column(String, nullable=False) - PurposeDetail = Column(String) - MajorVersion = Column(Integer, nullable=False) - MinorVersion = Column(Integer, nullable=False) - Settings = Column(Text) + ExtensionID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer) + Guid: Mapped[str] = mapped_column(String) + Purpose: Mapped[str] = mapped_column(String) + PurposeDetail: Mapped[str | None] = mapped_column(String) + MajorVersion: Mapped[int] = mapped_column(Integer) + MinorVersion: Mapped[int] = 
mapped_column(Integer) + Settings: Mapped[str | None] = mapped_column(Text) class ProcessingNodeFilterParameter(Base): __tablename__ = "ProcessingNodeFilterParameters" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - FilterParameterName = Column(String, primary_key=True, nullable=False) - FilterModuleTypeID = Column(Integer, nullable=False) - FilterModuleNumber = Column(Integer, nullable=False) - ProcessingNodeID = Column(Integer, nullable=False) - FilterParameterValue = Column(Float, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + FilterParameterName: Mapped[str] = mapped_column(String, primary_key=True) + FilterModuleTypeID: Mapped[int] = mapped_column(Integer) + FilterModuleNumber: Mapped[int] = mapped_column(Integer) + ProcessingNodeID: Mapped[int] = mapped_column(Integer) + FilterParameterValue: Mapped[float] = mapped_column(Float) -t_ProcessingNodeInterfaces = Table( - "ProcessingNodeInterfaces", - metadata, - Column("ProcessingNodeID", Integer, nullable=False), - Column("InterfaceKind", Integer, nullable=False), - Column("InterfaceName", String, nullable=False), -) +class ProcessingNodeInterface(Base): + __tablename__ = "ProcessingNodeInterfaces" + + ProcessingNodeID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + InterfaceKind: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + InterfaceName: Mapped[str] = mapped_column(String, primary_key=True, nullable=False) class ProcessingNodeParameter(Base): __tablename__ = "ProcessingNodeParameters" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - ParameterName = Column(String, primary_key=True, nullable=False) - FriendlyName = Column(String, nullable=False) - ProcessingNodeID = Column(Integer, nullable=False) - IntendedPurpose = Column(Integer, nullable=False) - PurposeDetails = Column(String, nullable=False) - Hidden = Column(Boolean, nullable=False) - Advanced = 
Column(Boolean, nullable=False) - Category = Column(String, nullable=False) - Position = Column(Integer, nullable=False) - ParameterValue = Column(String, nullable=False) - ValueDisplayString = Column(String, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + ParameterName: Mapped[str] = mapped_column(String, primary_key=True) + FriendlyName: Mapped[str] = mapped_column(String) + ProcessingNodeID: Mapped[int] = mapped_column(Integer) + IntendedPurpose: Mapped[int] = mapped_column(Integer) + PurposeDetails: Mapped[str] = mapped_column(String) + Hidden: Mapped[bool] = mapped_column(Boolean) + Advanced: Mapped[bool] = mapped_column(Boolean) + Category: Mapped[str] = mapped_column(String) + Position: Mapped[int] = mapped_column(Integer) + ParameterValue: Mapped[str] = mapped_column(String) + ValueDisplayString: Mapped[str] = mapped_column(String) class ProcessingNodeScore(Base): __tablename__ = "ProcessingNodeScores" __table_args__ = (UniqueConstraint("ProcessingNodeID", "ScoreName"),) - ProcessingNodeID = Column(Integer, nullable=False) - ScoreID = Column(Integer, primary_key=True) - ScoreName = Column(String, nullable=False) - FriendlyName = Column(String, nullable=False) - Description = Column(String, nullable=False) - FormatString = Column(String, nullable=False) - ScoreCategory = Column(Integer, nullable=False) - Hidden = Column(Boolean, nullable=False) - IsMainScore = Column(Boolean, nullable=False) - ScoreGUID = Column(String, nullable=False) + ProcessingNodeID: Mapped[int] = mapped_column(Integer) + ScoreID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ScoreName: Mapped[str] = mapped_column(String) + FriendlyName: Mapped[str] = mapped_column(String) + Description: Mapped[str] = mapped_column(String) + FormatString: Mapped[str] = mapped_column(String) + ScoreCategory: Mapped[int] = mapped_column(Integer) + Hidden: Mapped[bool] = mapped_column(Boolean) + IsMainScore: Mapped[bool] = 
mapped_column(Boolean) + ScoreGUID: Mapped[str] = mapped_column(String) class ProcessingNode(Base): __tablename__ = "ProcessingNodes" - ProcessingNodeNumber = Column(Integer, primary_key=True) - ProcessingNodeID = Column(Integer, nullable=False) - ProcessingNodeParentNumber = Column(String, nullable=False) - NodeName = Column(String) - FriendlyName = Column(String, nullable=False) - MajorVersion = Column(Integer, nullable=False) - MinorVersion = Column(Integer, nullable=False) - NodeComment = Column(String) - NodeGUID = Column(String, nullable=False) - ProcessingNodeState = Column(Integer, nullable=False, server_default=text("0")) + ProcessingNodeNumber: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ProcessingNodeID: Mapped[int] = mapped_column(Integer) + ProcessingNodeParentNumber: Mapped[str] = mapped_column(String) + NodeName: Mapped[str | None] = mapped_column(String) + FriendlyName: Mapped[str] = mapped_column(String) + MajorVersion: Mapped[int] = mapped_column(Integer) + MinorVersion: Mapped[int] = mapped_column(Integer) + NodeComment: Mapped[str | None] = mapped_column(String) + NodeGUID: Mapped[str] = mapped_column(String) + ProcessingNodeState: Mapped[int] = mapped_column(Integer, server_default=text("0")) class ProcessingNodesSpectra(Base): __tablename__ = "ProcessingNodesSpectra" - SendingProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - SpectrumID = Column(Integer, primary_key=True, nullable=False, index=True) + SendingProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + SpectrumID: Mapped[int] = mapped_column(Integer, primary_key=True, index=True) class ProteinAnnotation(Base): @@ -593,77 +630,76 @@ class ProteinAnnotation(Base): ), ) - ProteinAnnotationID = Column(Integer, primary_key=True) - ProteinID = Column(Integer, nullable=False) - DescriptionHashCode = Column(BigInteger, nullable=False) - Description = Column(Text, nullable=False) - TaxonomyID = Column(Integer, nullable=False, 
index=True) + ProteinAnnotationID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ProteinID: Mapped[int] = mapped_column(Integer) + DescriptionHashCode: Mapped[int] = mapped_column(BigInteger) + Description: Mapped[str] = mapped_column(Text) + TaxonomyID: Mapped[int] = mapped_column(Integer, index=True) class ProteinIdentificationGroup(Base): __tablename__ = "ProteinIdentificationGroups" - ProteinIdentificationGroupId = Column(Integer, primary_key=True, nullable=False) - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) + ProteinIdentificationGroupId: Mapped[int] = mapped_column(Integer, primary_key=True) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) class ProteinScore(Base): __tablename__ = "ProteinScores" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - ProteinID = Column(Integer, primary_key=True, nullable=False) - ProteinIdentificationGroupID = Column(Integer, nullable=False) - ProteinScore = Column(Float, nullable=False) - Coverage = Column(Float, nullable=False, server_default=text("0")) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True) + ProteinIdentificationGroupID: Mapped[int] = mapped_column(Integer) + ProteinScore: Mapped[float] = mapped_column(Float) + Coverage: Mapped[float] = mapped_column(Float, server_default=text("0")) class ProteinScoresDecoy(Base): __tablename__ = "ProteinScores_decoy" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - ProteinID = Column(Integer, primary_key=True, nullable=False) - ProteinIdentificationGroupID = Column(Integer, nullable=False) - ProteinScore = Column(Float, nullable=False) - Coverage = Column(Float, nullable=False, server_default=text("0")) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True) + 
ProteinIdentificationGroupID: Mapped[int] = mapped_column(Integer) + ProteinScore: Mapped[float] = mapped_column(Float) + Coverage: Mapped[float] = mapped_column(Float, server_default=text("0")) class Protein(Base): __tablename__ = "Proteins" - ProteinID = Column(Integer, primary_key=True) - Sequence = Column(Text, nullable=False) - SequenceHashCode = Column(BigInteger, nullable=False, index=True) - IsMasterProtein = Column(Boolean, nullable=False, server_default=text("0")) + ProteinID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + Sequence: Mapped[str] = mapped_column(Text) + SequenceHashCode: Mapped[int] = mapped_column(BigInteger, index=True) + IsMasterProtein: Mapped[bool] = mapped_column(Boolean, server_default=text("0")) -t_ProteinsProteinGroups = Table( - "ProteinsProteinGroups", - metadata, - Column("ProteinID", Integer, nullable=False), - Column("ProteinGroupID", Integer, nullable=False), -) +class ProteinsProteinGroup(Base): + __tablename__ = "ProteinsProteinGroups" + + ProteinID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + ProteinGroupID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) class PtmAnnotationDatum(Base): __tablename__ = "PtmAnnotationData" - AnnotationType = Column(Integer, primary_key=True, nullable=False) - ProteinId = Column(Integer, primary_key=True, nullable=False) - AnnotationId = Column(Integer, primary_key=True, nullable=False) - Position = Column(Integer, primary_key=True, nullable=False) - Annotation = Column(String) + AnnotationType: Mapped[int] = mapped_column(Integer, primary_key=True) + ProteinId: Mapped[int] = mapped_column(Integer, primary_key=True) + AnnotationId: Mapped[int] = mapped_column(Integer, primary_key=True) + Position: Mapped[int] = mapped_column(Integer, primary_key=True) + Annotation: Mapped[str | None] = mapped_column(String) class ReferenceSpectra(Base): __tablename__ = "ReferenceSpectra" - ReferenceSpectrumId = Column(Integer, 
primary_key=True) - Sequence = Column(String, nullable=False) - SequenceHashCode = Column(BigInteger, nullable=False) - Spectrum = Column(String, nullable=False) - SpectrumHashCode = Column(BigInteger, nullable=False) - Comment = Column(Text) - CommentHashCode = Column(BigInteger, nullable=False) + ReferenceSpectrumId: Mapped[int | None] = mapped_column(Integer, primary_key=True) + Sequence: Mapped[str] = mapped_column(String) + SequenceHashCode: Mapped[int] = mapped_column(BigInteger) + Spectrum: Mapped[str] = mapped_column(String) + SpectrumHashCode: Mapped[int] = mapped_column(BigInteger) + Comment: Mapped[str | None] = mapped_column(Text) + CommentHashCode: Mapped[int] = mapped_column(BigInteger) class ReporterIonQuanResult(Base): @@ -676,84 +712,82 @@ class ReporterIonQuanResult(Base): ), ) - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - QuanChannelID = Column(Integer, primary_key=True, nullable=False) - SpectrumID = Column(Integer, primary_key=True, nullable=False) - Mass = Column(Float, nullable=False) - Height = Column(Float) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + QuanChannelID: Mapped[int] = mapped_column(Integer, primary_key=True) + SpectrumID: Mapped[int] = mapped_column(Integer, primary_key=True) + Mass: Mapped[float] = mapped_column(Float) + Height: Mapped[float | None] = mapped_column(Float) -t_ReporterIonQuanResultsSearchSpectra = Table( - "ReporterIonQuanResultsSearchSpectra", - metadata, - Column("ProcessingNodeNumber", Integer, nullable=False), - Column("SpectrumID", Integer, nullable=False), - Column("SearchSpectrumID", Integer, index=True), -) +class ReporterIonQuanResultsSearchSpectra(Base): + __tablename__ = "ReporterIonQuanResultsSearchSpectra" + + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + SpectrumID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + SearchSpectrumID: Mapped[int | None] = 
mapped_column(Integer, index=True) class ScanEvent(Base): __tablename__ = "ScanEvents" - ScanEventID = Column(Integer, primary_key=True) - MSLevel = Column(Integer, nullable=False) - Polarity = Column(Integer, nullable=False) - ScanType = Column(Integer, nullable=False) - Ionization = Column(Integer, nullable=False) - MassAnalyzer = Column(Integer, nullable=False) - ActivationType = Column(Integer, nullable=False) + ScanEventID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + MSLevel: Mapped[int] = mapped_column(Integer) + Polarity: Mapped[int] = mapped_column(Integer) + ScanType: Mapped[int] = mapped_column(Integer) + Ionization: Mapped[int] = mapped_column(Integer) + MassAnalyzer: Mapped[int] = mapped_column(Integer) + ActivationType: Mapped[int] = mapped_column(Integer) class SchemaInfo(Base): __tablename__ = "SchemaInfo" - Version = Column(Integer, primary_key=True) - Kind = Column(String, nullable=False) - Date = Column(DateTime, nullable=False) - SoftwareVersion = Column(String, nullable=False) - Comment = Column(Text, nullable=False) + Version: Mapped[int | None] = mapped_column(Integer, primary_key=True) + Kind: Mapped[str] = mapped_column(String) + Date: Mapped[datetime] = mapped_column(DateTime) + SoftwareVersion: Mapped[str] = mapped_column(String) + Comment: Mapped[str] = mapped_column(Text) class Spectrum(Base): __tablename__ = "Spectra" - UniqueSpectrumID = Column(Integer, primary_key=True) - Spectrum = Column(String, nullable=False) - SpectrumHashCode = Column(BigInteger) + UniqueSpectrumID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + Spectrum: Mapped[str] = mapped_column(String) + SpectrumHashCode: Mapped[int | None] = mapped_column(BigInteger) class SpectrumHeader(Base): __tablename__ = "SpectrumHeaders" - SpectrumID = Column(Integer, primary_key=True) - MassPeakID = Column(Integer) - ScanEventID = Column(Integer) - LastScan = Column(Integer) - FirstScan = Column(Integer) - RetentionTime = Column(Float) - Hidden 
= Column(Boolean, nullable=False, server_default=text("0")) - ScanNumbers = Column(String) - Charge = Column(SmallInteger) - Mass = Column(Float) - CreatingProcessingNodeNumber = Column(Integer, nullable=False) - UniqueSpectrumID = Column(Integer, nullable=False, server_default=text("0")) + SpectrumID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + MassPeakID: Mapped[int | None] = mapped_column(Integer) + ScanEventID: Mapped[int | None] = mapped_column(Integer) + LastScan: Mapped[int | None] = mapped_column(Integer) + FirstScan: Mapped[int | None] = mapped_column(Integer) + RetentionTime: Mapped[float | None] = mapped_column(Float) + Hidden: Mapped[bool] = mapped_column(Boolean, server_default=text("0")) + ScanNumbers: Mapped[str | None] = mapped_column(String) + Charge: Mapped[int | None] = mapped_column(SmallInteger) + Mass: Mapped[float | None] = mapped_column(Float) + CreatingProcessingNodeNumber: Mapped[int] = mapped_column(Integer) + UniqueSpectrumID: Mapped[int] = mapped_column(Integer, server_default=text("0")) class SpectrumScore(Base): __tablename__ = "SpectrumScores" - ProcessingNodeNumber = Column(Integer, primary_key=True, nullable=False) - SpectrumID = Column(Integer, primary_key=True, nullable=False) - Score = Column(Float, nullable=False) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer, primary_key=True) + SpectrumID: Mapped[int] = mapped_column(Integer, primary_key=True) + Score: Mapped[float] = mapped_column(Float) -t_TaxonomyNames = Table( - "TaxonomyNames", - metadata, - Column("TaxonomyID", Integer, nullable=False, index=True), - Column("Name", String), - Column("NameCategory", Integer, nullable=False), -) +class TaxonomyName(Base): + __tablename__ = "TaxonomyNames" + + TaxonomyID: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False, index=True) + NameCategory: Mapped[int] = mapped_column(Integer, primary_key=True, nullable=False) + Name: Mapped[str | None] = mapped_column(String) class 
TaxonomyNode(Base): @@ -762,42 +796,37 @@ class TaxonomyNode(Base): Index("IX_TaxonomyNodes_LeftNodeIndex_RightNodeIndex", "LeftNodeIndex", "RightNodeIndex"), ) - TaxonomyID = Column(Integer, primary_key=True, unique=True) - ParentTaxonomyID = Column(Integer, nullable=False) - TaxonomyRank = Column(Integer, nullable=False) - LeftNodeIndex = Column(Integer, nullable=False) - RightNodeIndex = Column(Integer, nullable=False) - - -t_WorkflowInfo = Table( - "WorkflowInfo", - metadata, - Column("WorkflowName", String, nullable=False), - Column("WorkflowDescription", String, nullable=False), - Column("WorkflowState", Integer, nullable=False, server_default=text("0")), - Column("WorkflowStartDate", DateTime, nullable=False), - Column("WorkflowTemplate", String, nullable=False), - Column("User", String, nullable=False), - Column("WorkflowGUID", String, nullable=False), - Column("MachineGUID", String, nullable=False), - Column("MachineName", String, nullable=False), - Column("MergeSimilarIdentificationResults", Boolean, nullable=False), - Column("IsValid", Boolean, nullable=False), - Column("Version", Integer, nullable=False), -) + TaxonomyID: Mapped[int | None] = mapped_column(Integer, primary_key=True, unique=True) + ParentTaxonomyID: Mapped[int] = mapped_column(Integer) + TaxonomyRank: Mapped[int] = mapped_column(Integer) + LeftNodeIndex: Mapped[int] = mapped_column(Integer) + RightNodeIndex: Mapped[int] = mapped_column(Integer) -class WorkflowMessage(Base): - __tablename__ = "WorkflowMessages" +# TODO: Check which is primary key +class WorkflowInfo(Base): + __tablename__ = "WorkflowInfo" - MessageID = Column(Integer, primary_key=True) - ProcessingNodeID = Column(Integer, nullable=False) - ProcessingNodeNumber = Column(Integer, nullable=False) - Time = Column(BigInteger, nullable=False) - MessageKind = Column(Integer, nullable=False) - Message = Column(String, nullable=False) + WorkflowGUID: Mapped[str] = mapped_column(String, primary_key=True, nullable=False) + 
WorkflowName: Mapped[str] = mapped_column(String, nullable=False) + WorkflowDescription: Mapped[str] = mapped_column(String, nullable=False) + WorkflowState: Mapped[int] = mapped_column(Integer, nullable=False, server_default=text("0")) + WorkflowStartDate: Mapped[datetime] = mapped_column(DateTime, nullable=False) + WorkflowTemplate: Mapped[str] = mapped_column(String, nullable=False) + User: Mapped[str] = mapped_column(String, nullable=False) + MachineGUID: Mapped[str] = mapped_column(String, nullable=False) + MachineName: Mapped[str] = mapped_column(String, nullable=False) + MergeSimilarIdentificationResults: Mapped[bool] = mapped_column(Boolean, nullable=False) + IsValid: Mapped[bool] = mapped_column(Boolean, nullable=False) + Version: Mapped[int] = mapped_column(Integer, nullable=False) -t_sqlite_sequence = Table( - "sqlite_sequence", metadata, Column("name", NullType), Column("seq", NullType) -) +class WorkflowMessage(Base): + __tablename__ = "WorkflowMessages" + + MessageID: Mapped[int | None] = mapped_column(Integer, primary_key=True) + ProcessingNodeID: Mapped[int] = mapped_column(Integer) + ProcessingNodeNumber: Mapped[int] = mapped_column(Integer) + Time: Mapped[int] = mapped_column(BigInteger) + MessageKind: Mapped[int] = mapped_column(Integer) + Message: Mapped[str] = mapped_column(String) diff --git a/psm_utils/io/_utils.py b/psm_utils/io/_utils.py index e1572c2..01f714f 100644 --- a/psm_utils/io/_utils.py +++ b/psm_utils/io/_utils.py @@ -2,13 +2,14 @@ import sys -def set_csv_field_size_limit(): +def set_csv_field_size_limit() -> None: """ - Sets the maximum field size limit for reading CSV files. + Set the maximum field size limit for reading CSV files. - Note: - This function should be called before reading any CSV files to ensure that the field size - limit is properly set. + Notes + ----- + This function should be called before reading any CSV files to ensure that the field size + limit is properly set. 
""" max_int = sys.maxsize diff --git a/psm_utils/io/alphadia.py b/psm_utils/io/alphadia.py index 8f6e1b8..3b4c15b 100644 --- a/psm_utils/io/alphadia.py +++ b/psm_utils/io/alphadia.py @@ -4,7 +4,11 @@ import csv from abc import ABC -from typing import Iterable, Optional +from collections.abc import Iterator +from pathlib import Path +from typing import Any, cast + +import pandas as pd from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit @@ -14,7 +18,7 @@ set_csv_field_size_limit() # TODO: check -RESCORING_FEATURES = [ +RESCORING_FEATURES: list[str] = [ "rt_observed", "mobility_observed", "mz_observed", @@ -24,29 +28,37 @@ class AlphaDIAReader(ReaderBase, ABC): - def __init__(self, filename, *args, **kwargs): + """Reader for AlphaDIA TSV format.""" + + def __init__(self, filename: str | Path, *args: Any, **kwargs: Any) -> None: """ Reader for AlphaDIA ``precursor.tsv`` file. Parameters ---------- - filename : str or Path + filename Path to PSM file. + *args + Additional positional arguments for parent class. + **kwargs + Additional keyword arguments for parent class. 
""" super().__init__(filename, *args, **kwargs) - self.filename = filename - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with open(self.filename) as msms_in: reader = csv.DictReader(msms_in, delimiter="\t") for row in reader: - yield self._get_peptide_spectrum_match(row) + yield self._get_peptide_spectrum_match(row, self.filename) - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + @staticmethod + def _get_peptide_spectrum_match( + psm_dict: dict[str, Any], filename: str | Path | None = None + ) -> PSM: """Parse a single PSM from a AlphaDIA PSM file.""" - rescoring_features = {} + rescoring_features: dict[str, Any] = {} for ft in RESCORING_FEATURES: try: rescoring_features[ft] = psm_dict[ft] @@ -54,7 +66,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: continue return PSM( - peptidoform=self._parse_peptidoform( + peptidoform=AlphaDIAReader._parse_peptidoform( psm_dict["sequence"], psm_dict["mods"], psm_dict["mod_sites"], psm_dict["charge"] ), spectrum_id=psm_dict["frame_start"], # TODO: needs to be checked @@ -70,20 +82,20 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: protein_list=psm_dict["proteins"].split(";"), rank=int(psm_dict["rank"]) + 1, # AlphaDIA ranks are 0-based source="AlphaDIA", - provenance_data=({"alphadia_filename": str(self.filename)}), + provenance_data=({"alphadia_filename": str(filename)} if filename else {}), metadata={}, rescoring_features=rescoring_features, ) @staticmethod - def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str]) -> str: + def _parse_peptidoform(sequence: str, mods: str, mod_sites: str, charge: str | None) -> str: """Parse a peptidoform from a AlphaDIA PSM file.""" # Parse modifications if mods: - sequence_list = [""] + list(sequence) + [""] # N-term, sequence, C-term - for mod, site in zip(mods.split(";"), mod_sites.split(";")): - site = int(site) - name = mod.split("@")[0] + 
sequence_list: list[str] = [""] + list(sequence) + [""] # N-term, sequence, C-term + for mod, site_str in zip(mods.split(";"), mod_sites.split(";")): + site: int = int(site_str) + name: str = mod.split("@")[0] # N-terminal modification if site == 0: sequence_list[0] = f"[{name}]-" @@ -102,11 +114,7 @@ def _parse_peptidoform(sequence: str, mods: str, mod_sites, charge: Optional[str return sequence @classmethod - def from_dataframe(cls, dataframe) -> PSMList: + def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList: """Create a PSMList from a AlphaDIA Pandas DataFrame.""" - return PSMList( - psm_list=[ - cls._get_peptide_spectrum_match(cls(""), entry) - for entry in dataframe.to_dict(orient="records") - ] - ) + records = cast(list[dict[str, Any]], dataframe.to_dict(orient="records")) + return PSMList(psm_list=[cls._get_peptide_spectrum_match(entry) for entry in records]) diff --git a/psm_utils/io/diann.py b/psm_utils/io/diann.py index 1d93183..2d46a5b 100644 --- a/psm_utils/io/diann.py +++ b/psm_utils/io/diann.py @@ -1,12 +1,11 @@ """ -Reader for PSM files from DIA-NN +Reader for PSM files from DIA-NN. Reads the '.tsv' file as defined on the `DIA-NN documentation page `_. Notes ----- - - DIA-NN calculates q-values at both the run and library level. The run-level q-value is used as the PSM q-value. - DIA-NN currently does not return precursor m/z values. 
@@ -18,7 +17,11 @@ import csv import re -from typing import Iterable, Optional +from collections.abc import Iterator +from pathlib import Path +from typing import Any, cast + +import pandas as pd from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit @@ -27,7 +30,7 @@ set_csv_field_size_limit() -RESCORING_FEATURES = [ +RESCORING_FEATURES: list[str] = [ "RT", "Predicted.RT", "iRT", @@ -42,7 +45,9 @@ class DIANNTSVReader(ReaderBase): - def __init__(self, filename, *args, **kwargs) -> None: + """Reader for DIA-NN TSV format.""" + + def __init__(self, filename: str | Path, *args: Any, **kwargs: Any) -> None: """ Reader for DIA-NN '.tsv' file. @@ -50,21 +55,27 @@ def __init__(self, filename, *args, **kwargs) -> None: ---------- filename : str or Path Path to PSM file. + *args + Additional positional arguments passed to the base class. + **kwargs + Additional keyword arguments passed to the base class. """ super().__init__(filename, *args, **kwargs) - self.filename = filename - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with open(self.filename) as msms_in: reader = csv.DictReader(msms_in, delimiter="\t") for row in reader: - yield self._get_peptide_spectrum_match(row) + yield self._get_peptide_spectrum_match(row, self.filename) - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + @staticmethod + def _get_peptide_spectrum_match( + psm_dict: dict[str, str], filename: str | Path | None = None + ) -> PSM: """Parse a single PSM from a DIA-NN PSM file.""" - rescoring_features = {} + rescoring_features: dict[str, Any] = {} for ft in RESCORING_FEATURES: try: rescoring_features[ft] = psm_dict[ft] @@ -72,7 +83,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: continue return PSM( - peptidoform=self._parse_peptidoform( + peptidoform=DIANNTSVReader._parse_peptidoform( psm_dict["Modified.Sequence"], psm_dict["Precursor.Charge"] ), 
spectrum_id=psm_dict["MS2.Scan"], @@ -87,20 +98,20 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: protein_list=psm_dict["Protein.Ids"].split(";"), source="diann", rank=None, - provenance_data=({"diann_filename": str(self.filename)}), + provenance_data=({"diann_filename": str(filename)} if filename else {}), rescoring_features=rescoring_features, metadata={}, ) @staticmethod - def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: + def _parse_peptidoform(peptide: str, charge: str | None) -> str: # Add charge if charge: peptide += f"/{int(float(charge))}" # Replace parentheses with square brackets and capitalize UniMod prefix - pattern = r"\(UniMod:(\d+)\)" - replacement = r"[UNIMOD:\1]" + pattern: str = r"\(UniMod:(\d+)\)" + replacement: str = r"[UNIMOD:\1]" peptide = re.sub(pattern, replacement, peptide) # Add hyphen for N-terminal modifications @@ -115,11 +126,7 @@ def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: return peptide @classmethod - def from_dataframe(cls, dataframe) -> PSMList: + def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList: """Create a PSMList from a DIA-NN Pandas DataFrame.""" - return PSMList( - ptm_list=[ - cls._get_peptide_spectrum_match(cls(""), entry) - for entry in dataframe.to_dict(orient="records") - ] - ) + records = cast(list[dict[str, str]], dataframe.to_dict(orient="records")) + return PSMList(psm_list=[cls._get_peptide_spectrum_match(entry) for entry in records]) diff --git a/psm_utils/io/flashlfq.py b/psm_utils/io/flashlfq.py index e2dac9f..5ca1a13 100644 --- a/psm_utils/io/flashlfq.py +++ b/psm_utils/io/flashlfq.py @@ -19,8 +19,9 @@ import csv import logging +from collections.abc import Iterator from pathlib import Path -from typing import Optional, Union +from typing import Any import numpy as np @@ -38,12 +39,16 @@ class FlashLFQReader(ReaderBase): """Reader for FlashLFQ TSV format.""" - required_columns = ["Full Sequence", "Precursor Charge"] + required_columns: list[str] = 
["Full Sequence", "Precursor Charge"] - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "rt") as open_file: + with open(self.filename) as open_file: reader = csv.DictReader(open_file, delimiter="\t") + if not reader.fieldnames: + raise PSMUtilsIOException( + f"FlashLFQ TSV file '{self.filename}' is empty or has no valid header." + ) if not all(col in reader.fieldnames for col in self.required_columns): raise PSMUtilsIOException( f"FlashLFQ TSV file must contain the following columns: {self.required_columns}" @@ -51,7 +56,7 @@ def __iter__(self): for i, row in enumerate(reader): yield self._parse_entry(row, spectrum_id=str(i)) - def _parse_entry(self, entry: dict, spectrum_id) -> PSM: + def _parse_entry(self, entry: dict[str, Any], spectrum_id: str) -> PSM: """Parse single FlashLFQ TSV entry to :py:class:`~psm_utils.psm.PSM`.""" # Replace empty strings with None entry = {k: v if v else None for k, v in entry.items()} @@ -66,7 +71,7 @@ def _parse_entry(self, entry: dict, spectrum_id) -> PSM: ) @staticmethod - def _parse_protein_list(protein_accession: Optional[str]) -> list[str]: + def _parse_protein_list(protein_accession: str | None) -> list[str]: """Parse protein list string to list of protein accessions.""" if not protein_accession: return [] @@ -81,14 +86,24 @@ def _parse_protein_list(protein_accession: Optional[str]) -> list[str]: class FlashLFQWriter(WriterBase): """Reader for FlashLFQ TSV format.""" + _default_fieldnames: list[str] = [ + "File Name", + "Base Sequence", + "Full Sequence", + "Peptide Monoisotopic Mass", + "Scan Retention Time", + "Precursor Charge", + "Protein Accession", + ] + def __init__( self, - filename: Union[str, Path], - *args, + filename: str | Path, + *args: Any, fdr_threshold: float = 0.01, only_targets: bool = True, - **kwargs, - ): + **kwargs: Any, + ) -> None: """ Reader for psm_utils TSV format. 
@@ -96,40 +111,37 @@ def __init__( ---------- filename Path to PSM file. + *args + Additional positional arguments passed to the base class. fdr_threshold FDR threshold for filtering PSMs. only_targets If True, only target PSMs are written to file. If False, both target and decoy PSMs are written. + **kwargs + Additional keyword arguments passed to the base class. """ super().__init__(filename, *args, **kwargs) - self.fdr_threshold = fdr_threshold - self.only_targets = only_targets + self.fdr_threshold: float = fdr_threshold + self.only_targets: bool = only_targets - self._open_file = None - self._writer = None - self.fieldnames = None + self._open_file: Any = None + self._writer: Any = None + self.fieldnames: list[str] | None = None def __enter__(self) -> FlashLFQWriter: + """Open file for writing and return self.""" if Path(self.filename).is_file(): # Get fieldnames from existing file - with open(self.filename, "rt") as open_file: + with open(self.filename) as open_file: # Get fieldnames self.fieldnames = open_file.readline().strip().split("\t") - mode = "at" + mode: str = "at" else: - # Set default fieldnames - self.fieldnames = [ - "File Name", - "Base Sequence", - "Full Sequence", - "Peptide Monoisotopic Mass", - "Scan Retention Time", - "Precursor Charge", - "Protein Accession", - ] + # Set default fieldnames; avoiding mutation of class variable + self.fieldnames = self._default_fieldnames[:] mode = "wt" # Open file and writer @@ -146,12 +158,14 @@ def __enter__(self) -> FlashLFQWriter: return self - def __exit__(self, *args, **kwargs) -> None: - self._open_file.close() + def __exit__(self, *args: Any, **kwargs: Any) -> None: + """Close file and writer.""" + if self._open_file is not None: + self._open_file.close() self._open_file = None self._writer = None - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """ Write a single PSM to new or existing PSM file. 
@@ -168,14 +182,14 @@ def write_psm(self, psm: PSM): entry = self._psm_to_entry(psm) try: - self._writer.writerow(entry) + self._writer.writerow(entry) # type: ignore[union-attr] except AttributeError as e: raise PSMUtilsIOException( f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" "is opened in context (i.e., using the `with` statement)." ) from e - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """ Write an entire PSMList to a new PSM file. @@ -188,8 +202,8 @@ def write_file(self, psm_list: PSMList): # Filter out decoys if self.only_targets: # Accept both None and False - target_mask = np.array([not psm.is_decoy for psm in psm_list]) - LOGGER.debug(f"Skipping {~target_mask.sum()} decoy PSMs for FlashLFQ file.") + target_mask = np.array([not psm.is_decoy for psm in psm_list], dtype=bool) + LOGGER.debug(f"Skipping {(~target_mask).sum()} decoy PSMs for FlashLFQ file.") else: target_mask = np.ones(len(psm_list), dtype=bool) @@ -198,15 +212,18 @@ def write_file(self, psm_list: PSMList): LOGGER.warning( "Not all PSMs have a q-value. Skipping FDR filtering for FlashLFQ file." 
) - fdr_mask = np.ones(len(psm_list), dtype=bool) + fdr_mask: np.ndarray[Any, np.dtype[np.bool_]] = np.ones(len(psm_list), dtype=bool) else: fdr_mask = psm_list["qvalue"] <= self.fdr_threshold - filtered_by_fdr = (~fdr_mask & target_mask).sum() + filtered_by_fdr: int = (~fdr_mask & target_mask).sum() LOGGER.debug(f"Skipping {filtered_by_fdr} PSMs above FDR threshold for FlashLFQ file.") - filtered_psm_list = psm_list[target_mask & fdr_mask] + filtered_psm_list: PSMList = psm_list[target_mask & fdr_mask] - with open(self.filename, "wt", newline="") as f: + with open(self.filename, "w", newline="") as f: + if not self.fieldnames: + # Set default fieldnames; avoiding mutation of class variable + self.fieldnames = self._default_fieldnames[:] writer = csv.DictWriter( f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" ) @@ -215,7 +232,7 @@ def write_file(self, psm_list: PSMList): writer.writerow(self._psm_to_entry(psm)) @staticmethod - def _psm_to_entry(psm: PSM) -> dict: + def _psm_to_entry(psm: PSM) -> dict[str, Any]: """Convert :py:class:`~psm_utils.psm.PSM` to FlashLFQ TSV entry.""" return { "File Name": psm.run, diff --git a/psm_utils/io/fragpipe.py b/psm_utils/io/fragpipe.py index fc07395..badbdc5 100644 --- a/psm_utils/io/fragpipe.py +++ b/psm_utils/io/fragpipe.py @@ -6,7 +6,6 @@ Notes ----- - - Decoy PSMs and q-values are not returned by FragPipe. 
""" @@ -15,8 +14,11 @@ import csv from abc import ABC +from collections.abc import Iterator from pathlib import Path -from typing import Iterable, Optional +from typing import Any, cast + +import pandas as pd from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit @@ -25,7 +27,7 @@ set_csv_field_size_limit() -RESCORING_FEATURES = [ +RESCORING_FEATURES: list[str] = [ "Peptide Length", "Retention", "Observed Mass", @@ -38,12 +40,17 @@ class FragPipeReader(ReaderBase, ABC): + """Reader for FragPipe TSV format.""" + + use_calibrated_mz: bool + _mz_key: str + def __init__( self, - filename, + filename: str | Path, use_calibrated_mz: bool = True, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: """ Reader for MSFragger ``psm.tsv`` file. @@ -55,22 +62,25 @@ def __init__( use_calibrated_mz Whether to use ``Calibrated Observed M/Z`` (true) or non-calibrated ``Observed m/z`` (false), by default True. + *args + Additional positional arguments passed to the base class. + **kwargs + Additional keyword arguments passed to the base class. 
""" super().__init__(filename, *args, **kwargs) - self.filename = filename self.use_calibrated_mz = use_calibrated_mz self._mz_key = "Calibrated Observed M/Z" if use_calibrated_mz else "Observed M/Z" - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with open(self.filename) as msms_in: reader = csv.DictReader(msms_in, delimiter="\t") for row in reader: yield self._get_peptide_spectrum_match(row) - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM: """Parse a single PSM from a FragPipe PSM file.""" rescoring_features = {ft: psm_dict[ft] for ft in RESCORING_FEATURES if ft in psm_dict} @@ -98,7 +108,7 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: ) @staticmethod - def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> str: + def _parse_peptidoform(mod_peptide: str, peptide: str, charge: str | None) -> str: """Parse the peptidoform from the modified peptide, peptide, and charge columns.""" if mod_peptide: peptide = mod_peptide @@ -117,14 +127,14 @@ def _parse_peptidoform(mod_peptide: str, peptide: str, charge: Optional[str]) -> @staticmethod def _parse_spectrum_id(spectrum: str) -> str: - """Extract scan number from spectrum ID: ``(file name).(scan #).(scan #).(charge).``""" + """Extract scan number from spectrum ID: ``(file name).(scan #).(scan #).(charge).``.""" try: return spectrum.split(".")[-2] except IndexError: return spectrum @staticmethod - def _parse_protein_list(razor_protein: str, mapped_proteins) -> list[str]: + def _parse_protein_list(razor_protein: str, mapped_proteins: str | None) -> list[str]: """Combine razor protein and mapped proteins into a single list.""" if mapped_proteins: mapped_proteins_list = mapped_proteins.split(", ") @@ -144,11 +154,14 @@ def _parse_run(spectrum_file: str) -> str: return Path(spectrum_file).stem @classmethod - def 
from_dataframe(cls, dataframe) -> PSMList: - """Create a PSMList from a pandas DataFrame.""" + def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList: + """Create a PSMList from a Pandas DataFrame.""" + # Create a temporary reader instance to access the parsing method + temp_reader = cls(filename="") + return PSMList( - ptm_list=[ - cls._get_peptide_spectrum_match(cls(""), entry) + psm_list=[ + temp_reader._get_peptide_spectrum_match(cast(dict[str, Any], entry)) for entry in dataframe.to_dict(orient="records") ] ) diff --git a/psm_utils/io/idxml.py b/psm_utils/io/idxml.py index c993d85..d4171fe 100644 --- a/psm_utils/io/idxml.py +++ b/psm_utils/io/idxml.py @@ -1,27 +1,27 @@ """ Interface with OpenMS idXML PSM files. - Notes ----- - * idXML supports multiple peptide hits (identifications) per spectrum. Each peptide hit is parsed as an individual :py:class:`~psm_utils.psm.PSM` object. """ + from __future__ import annotations import logging import re -from warnings import filterwarnings +from collections.abc import Iterator from pathlib import Path -from typing import Iterable, List, Tuple, Union +from typing import Any, cast +from warnings import filterwarnings -from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase, WriterBase +from psm_utils.io.exceptions import PSMUtilsIOException +from psm_utils.peptidoform import Peptidoform from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.peptidoform import Peptidoform filterwarnings( "ignore", @@ -31,14 +31,24 @@ ) try: - import pyopenms as oms #noqa: E402 + import pyopenms as oms # type: ignore[import] + + _has_openms = True except ImportError: _has_openms = False -else: - _has_openms = True + oms = None # type: ignore[assignment] logger = logging.getLogger(__name__) +DEFAULT_SCORE_TYPE = "search_engine_score" +TARGET_DECOY_KEY = "target_decoy" +QVALUE_KEY = "q-value" +PEP_KEY = "PEP" +SPECTRUM_REFERENCE_KEY = "spectrum_reference" 
+ID_MERGE_INDEX_KEY = "id_merge_index" +SPECTRA_DATA_KEY = "spectra_data" +ION_MOBILITY_KEY = "IM" + # Patterns to match open and closed round/square brackets MOD_PATTERN = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)") MOD_PATTERN_NTERM = re.compile(r"^\.\[((?:[^][]+|\[(?:[^][]+|\[[^][]*\])*\])*)\]") @@ -47,7 +57,8 @@ # Extracted from the OpenMS PSMFeatureExtractor, which adds and manipulates features that will be given to percolator # https://github.com/OpenMS/OpenMS/blob/342f6524e76a2bab3dcb428ba2f4aa2d6bfe8483/src/topp/PSMFeatureExtractor.cpp RESCORING_FEATURE_LIST = [ - "isotope_error" "MS:1002049", # MSGFPlus unchanged RawScore + "isotope_error", + "MS:1002049", # MSGFPlus unchanged RawScore "MS:1002050", # MSGFPlus unchanged DeNovoScore "MSGF:ScoreRatio", "MSGF:Energy", @@ -62,8 +73,8 @@ "XTANDEM:hyperscore", "XTANDEM:deltascore", "MS:1001330", # expect_score - "hyperscore", # MSfragger - "nextscore", # MSfragger + "hyperscore", # MSFragger + "nextscore", # MSFragger "COMET:deltaCn", # recalculated deltaCn = (current_XCorr - 2nd_best_XCorr) / max(current_XCorr, 1) "COMET:deltaLCn", # deltaLCn = (current_XCorr - worst_XCorr) / max(current_XCorr, 1) "COMET:lnExpect", # log(E-value) @@ -76,7 +87,8 @@ "MASCOT:delta_score", # delta score based on mScore "CONCAT:lnEvalue", "CONCAT:deltaLnEvalue", - "SAGE:ln(-poisson)" "SAGE:ln(delta_best)", + "SAGE:ln(-poisson)", + "SAGE:ln(delta_best)", "SAGE:ln(delta_next)", "SAGE:ln(matched_intensity_pct)", "SAGE:longest_b", @@ -88,7 +100,14 @@ class IdXMLReader(ReaderBase): - def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: + """Reader for idXML files with comprehensive type safety and error handling.""" + + protein_ids: Any # list[oms.ProteinIdentification] + peptide_ids: Any # list[oms.PeptideIdentification] + user_params_metadata: list[str] + rescoring_features: list[str] + + def __init__(self, filename: Path | str, *args: Any, **kwargs: Any) -> None: """ Reader for idXML files. 
@@ -96,34 +115,50 @@ def __init__(self, filename: Union[Path, str], *args, **kwargs) -> None: ---------- filename: str, pathlib.Path Path to idXML file. + *args + Additional positional arguments passed to the base class. + **kwargs + Additional keyword arguments passed to the base class. Examples -------- >>> from psm_utils.io import IdXMLReader >>> reader = IdXMLReader("example.idXML") >>> psm_list = [psm for psm in reader] + """ super().__init__(filename, *args, **kwargs) if not _has_openms: raise ImportError("pyOpenMS is required to read idXML files") + self.protein_ids, self.peptide_ids = self._parse_idxml() self.user_params_metadata = self._get_userparams_metadata(self.peptide_ids[0].getHits()[0]) self.rescoring_features = self._get_rescoring_features(self.peptide_ids[0].getHits()[0]) - def __iter__(self) -> Iterable[PSM]: - """ - Iterate over file and return PSMs one-by-one. - """ + def __iter__(self) -> Iterator[PSM]: + """Iterate over file and return PSMs one-by-one.""" for peptide_id in self.peptide_ids: for peptide_hit in peptide_id.getHits(): yield self._parse_psm(self.protein_ids, peptide_id, peptide_hit) - def _parse_idxml(self) -> Tuple[oms.ProteinIdentification, oms.PeptideIdentification]: + def _parse_idxml(self) -> tuple[Any, Any]: """ Parse idXML using pyopenms and perform sanity checks to make sure the file is not empty. 
+ + Returns + ------- + tuple of (Any, Any) + Tuple containing (ProteinIdentification, PeptideIdentification) lists + + Raises + ------ + IdXMLReaderEmptyListException + If the idXML file contains no data to parse + """ - protein_ids, peptide_ids = [], [] - oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) + protein_ids: Any = [] # list[oms.ProteinIdentification] + peptide_ids: Any = [] # list[oms.PeptideIdentification] + oms.IdXMLFile().load(str(self.filename), protein_ids, peptide_ids) # type: ignore if len(protein_ids) == 0: raise IdXMLReaderEmptyListException( @@ -145,11 +180,24 @@ def _parse_peptidoform(sequence: str, charge: int) -> str: """ Parse idXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`. + Parameters + ---------- + sequence + Peptide sequence in idXML format + charge + Precursor charge state + + Returns + ------- + str + Peptide sequence in Peptidoform format with charge + Notes ----- Implemented according to the documentation on `github.com/OpenMS/OpenMS `_ . The differentiation between square- and round bracket notation is removed after parsing. + """ sequence = MOD_PATTERN.sub(r"[\1]", sequence) if sequence[:2] == ".[": @@ -163,9 +211,9 @@ def _parse_peptidoform(sequence: str, charge: int) -> str: def _parse_psm( self, - protein_ids: oms.ProteinIdentification, - peptide_id: oms.PeptideIdentification, - peptide_hit: oms.PeptideHit, + protein_ids: Any, + peptide_id: Any, + peptide_hit: Any, ) -> PSM: """ Parse idXML :py:class:`~pyopenms.PeptideHit` to :py:class:`~psm_utils.psm.PSM`. @@ -173,6 +221,21 @@ def _parse_psm( Uses additional information from :py:class:`~pyopenms.ProteinIdentification` and :py:class:`~pyopenms.PeptideIdentification` to annotate parameters of the :py:class:`~psm_utils.psm.PSM` object. 
+ + Parameters + ---------- + protein_ids + List of ProteinIdentification objects + peptide_id + PeptideIdentification object + peptide_hit + PeptideHit object + + Returns + ------- + PSM + Parsed PSM object with all available information + """ peptidoform = self._parse_peptidoform( peptide_hit.getSequence().toString(), peptide_hit.getCharge() @@ -184,17 +247,38 @@ def _parse_psm( "idxml:significance_threshold": str(peptide_id.getSignificanceThreshold()), } peptide_hit_metadata = { - key: peptide_hit.getMetaValue(key) for key in self.user_params_metadata + key: str(peptide_hit.getMetaValue(key)) + if peptide_hit.getMetaValue(key) is not None + else "" + for key in self.user_params_metadata } + + # Extract qvalue and pep if they exist + qvalue = None + if peptide_hit.metaValueExists(QVALUE_KEY): + try: + qvalue = float(peptide_hit.getMetaValue(QVALUE_KEY)) + except (ValueError, TypeError): + pass + + pep = None + if peptide_hit.metaValueExists(PEP_KEY): + try: + pep = float(peptide_hit.getMetaValue(PEP_KEY)) + except (ValueError, TypeError): + pass + return PSM( peptidoform=peptidoform, - spectrum_id=peptide_id.getMetaValue("spectrum_reference"), + spectrum_id=peptide_id.getMetaValue(SPECTRUM_REFERENCE_KEY), run=self._get_run(protein_ids, peptide_id), is_decoy=self._is_decoy(peptide_hit), score=peptide_hit.getScore(), + qvalue=qvalue, + pep=pep, precursor_mz=peptide_id.getMZ(), retention_time=peptide_id.getRT(), - ion_mobility=float(im) if (im := peptide_id.getMetaValue("IM")) is not None else None, + ion_mobility=self._get_ion_mobility(peptide_hit), protein_list=[ accession.decode() for accession in peptide_hit.extractProteinAccessionsSet() ], @@ -204,66 +288,91 @@ def _parse_psm( # to original sequence in writer provenance_data={str(peptidoform): peptide_hit.getSequence().toString()}, # Store metadata of PeptideIdentification and PeptideHit objects - metadata= {**peptide_id_metadata, **peptide_hit_metadata}, + metadata={**peptide_id_metadata, 
**peptide_hit_metadata}, rescoring_features={ - key: float(peptide_hit.getMetaValue(key)) for key in self.rescoring_features + key: float(peptide_hit.getMetaValue(key)) # type: ignore + for key in self.rescoring_features }, ) @staticmethod - def _get_run( - protein_ids: oms.ProteinIdentification, peptide_id: oms.PeptideIdentification - ) -> str: + def _get_run(protein_ids: Any, peptide_id: Any) -> str | None: """ Get run name from idXML using pyopenms. If the idXML file contains a merge index, use it to annotate the run name without file extension. """ - if peptide_id.metaValueExists("id_merge_index"): - run = Path( - protein_ids[0] - .getMetaValue("spectra_data")[peptide_id.getMetaValue("id_merge_index")] - .decode() - ).stem - elif protein_ids[0].metaValueExists("spectra_data"): - run = Path(protein_ids[0].getMetaValue("spectra_data")[0].decode()).stem + # Check if spectra_data is available + if not protein_ids[0].metaValueExists(SPECTRA_DATA_KEY): + return None + + spectra_data = cast(list[bytes], protein_ids[0].getMetaValue(SPECTRA_DATA_KEY)) + + # Determine index to use + if peptide_id.metaValueExists(ID_MERGE_INDEX_KEY): + index = cast(int, peptide_id.getMetaValue(ID_MERGE_INDEX_KEY)) else: - run = None + index = 0 - # Convert back to None value (see writer) - if run == "None": - run = None + # Extract run path + try: + run_path = Path(spectra_data[index].decode()).stem + except (IndexError, UnicodeDecodeError): + return None - return run + # Handle the special case where run path is the string "None" + return None if run_path == "None" else run_path + + @staticmethod + def _get_ion_mobility(peptide_hit: Any) -> float | None: + """ + Get ion mobility from PeptideHit. 
- def _get_userparams_metadata(self, peptide_hit: oms.PeptideHit) -> List[str]: + Parameters + ---------- + peptide_hit + PeptideHit object + + Returns + ------- + float or None + Ion mobility value or None if not available or invalid + + """ + if not peptide_hit.metaValueExists(ION_MOBILITY_KEY): + return None + + im_value = peptide_hit.getMetaValue(ION_MOBILITY_KEY) + try: + return float(im_value) # type: ignore[arg-type] + except (ValueError, TypeError): + return None + + def _get_userparams_metadata(self, peptide_hit: Any) -> list[str]: """Get list of string type UserParams attached to each PeptideHit.""" # Fill the key list with all the keys from the PeptideHit # Empty list is required for the Cython wrapper to work correctly - keys = [] + keys: list[bytes] = [] peptide_hit.getKeys(keys) - keys = [ + + return [ key.decode() for key in keys if not self._is_float(peptide_hit.getMetaValue(key.decode())) ] - return keys - def _get_rescoring_features(self, peptide_hit: oms.PeptideHit) -> List[str]: + def _get_rescoring_features(self, peptide_hit: Any) -> list[str]: """Get list of rescoring features in UserParams attached to each PeptideHit.""" - keys = [] + keys: list[bytes] = [] peptide_hit.getKeys(keys) - keys = [ - key.decode() - for key in keys - if self._is_float(peptide_hit.getMetaValue(key.decode())) - and key.decode() in RESCORING_FEATURE_LIST + + return [ + key.decode() for key in keys if self._is_float(peptide_hit.getMetaValue(key.decode())) ] - return keys @staticmethod - def _is_float(element: any) -> bool: + def _is_float(element: Any) -> bool: """Check if element can be coerced to a float.""" if element is None: return False @@ -274,22 +383,27 @@ def _is_float(element: any) -> bool: return False @staticmethod - def _is_decoy(peptide_hit: oms.PeptideHit) -> bool: + def _is_decoy(peptide_hit: Any) -> bool | None: """Check if PSM is target or decoy.""" - if peptide_hit.metaValueExists("target_decoy"): - return peptide_hit.getMetaValue("target_decoy") == 
"decoy" + if peptide_hit.metaValueExists(TARGET_DECOY_KEY): + return peptide_hit.getMetaValue(TARGET_DECOY_KEY) == "decoy" else: return None class IdXMLWriter(WriterBase): + """Writer for idXML files with comprehensive error handling.""" + + protein_ids: Any | None + peptide_ids: Any | None + def __init__( self, - filename: Union[str, Path], - protein_ids=None, - peptide_ids=None, - *args, - **kwargs, + filename: str | Path, + *args: Any, + protein_ids: Any | None = None, + peptide_ids: Any | None = None, + **kwargs: Any, ) -> None: """ Writer for idXML files. @@ -298,10 +412,14 @@ def __init__( ---------- filename Path to PSM file. + *args + Additional positional arguments passed to the base class. protein_ids - Optional :py:class:`~pyopenms.ProteinIdentification` object to be written to the idXML file. + Optional list of :py:class:`~pyopenms.ProteinIdentification` objects to be written to the idXML file. peptide_ids - Optional :py:class:`~pyopenms.PeptideIdentification` object to be written to the idXML file. + Optional list of :py:class:`~pyopenms.PeptideIdentification` objects to be written to the idXML file. + **kwargs + Additional keyword arguments passed to the base class. Notes ----- @@ -335,14 +453,16 @@ def __init__( super().__init__(filename, *args, **kwargs) if not _has_openms: raise ImportError("pyOpenMS is required to write idXML files") + self.protein_ids = protein_ids self.peptide_ids = peptide_ids - self._writer = None def __enter__(self) -> IdXMLWriter: + """Open file for writing and return self.""" return self def __exit__(self, *args, **kwargs) -> None: + """Close file and writer.""" pass def write_psm(self, psm: PSM): @@ -351,6 +471,11 @@ def write_psm(self, psm: PSM): This method is currently not supported (see Notes). 
+ Parameters + ---------- + psm + PSM object to write + Raises ------ NotImplementedError @@ -366,6 +491,11 @@ def write_file(self, psm_list: PSMList) -> None: If `self.protein_ids` and `self.peptide_ids` are not None, the PSM list scores, ranks, and rescoring features will first be merged with the existing IDs from those objects. + Parameters + ---------- + psm_list + List of PSM objects to write to file + """ psm_dict = psm_list.get_psm_dict() @@ -381,7 +511,9 @@ def write_file(self, psm_list: PSMList) -> None: else: self._create_new_ids(psm_dict) - def _update_existing_ids(self, psm_dict: dict) -> None: + def _update_existing_ids( + self, psm_dict: dict[str | None, dict[str, dict[str, list[PSM]]]] + ) -> None: """ Update an existing idXML file with info from the PSM list or write a new one. @@ -389,20 +521,27 @@ def _update_existing_ids(self, psm_dict: dict) -> None: :py:class:`~pyopenms.PeptideIdentification` objects with new features from the PSMList or create new ones. """ + if not self.protein_ids or not self.peptide_ids: + raise IdXMLException( + "Both protein_ids and peptide_ids must be provided to update existing idXML." 
+ ) # Access run name(s) from ProteinIdentification spectrum_files = [ - Path(run.decode()).stem for run in self.protein_ids[0].getMetaValue("spectra_data") + Path(run.decode()).stem + for run in cast(list[bytes], self.protein_ids[0].getMetaValue(SPECTRA_DATA_KEY)) ] for peptide_id in self.peptide_ids: if len(spectrum_files) > 1: - run = spectrum_files[peptide_id.getMetaValue("id_merge_index")] + id_merge_index = cast(int, peptide_id.getMetaValue(ID_MERGE_INDEX_KEY)) + run = spectrum_files[id_merge_index] else: run = spectrum_files[0] # Get PSM objects associated from runs since we are writing a merged idXML # NOTE: Collections with multiple protein_ids and peptide_ids is not supported try: - psms = psm_dict[None][run][peptide_id.getMetaValue("spectrum_reference")] + spectrum_ref = cast(str, peptide_id.getMetaValue(SPECTRUM_REFERENCE_KEY)) + psms = psm_dict[None][run][spectrum_ref] except KeyError as e: raise IdXMLException( "Multiple collections are not supported when parsing single pyopenms protein " @@ -410,7 +549,11 @@ def _update_existing_ids(self, psm_dict: dict) -> None: ) from e # Dict of UNIMOD peptide sequence and PSM object - hit_dict = {psm.provenance_data[str(psm.peptidoform)]: psm for psm in psms} + hit_dict = { + (psm.provenance_data or {})[str(psm.peptidoform)]: psm + for psm in psms + if psm.provenance_data and str(psm.peptidoform) in psm.provenance_data + } # Update PeptideHits according to the PSM objects updated_peptide_hits = [] for peptide_hit in peptide_id.getHits(): @@ -421,112 +564,154 @@ def _update_existing_ids(self, psm_dict: dict) -> None: peptide_id.setHits(updated_peptide_hits) - oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) + oms.IdXMLFile().store(str(self.filename), self.protein_ids, self.peptide_ids) # type: ignore - def _update_peptide_hit(self, peptide_hit: oms.PeptideHit, psm: PSM) -> None: - """ - Inplace update of :py:class:`~pyopenms.PeptideHit` with novel predicted features - information from 
:py:class:`~psm_utils.psm.PSM`. - """ + def _update_peptide_hit(self, peptide_hit: Any, psm: PSM) -> None: + """Inplace update of PeptideHit with novel predicted features information from PSM.""" + # Update core PSM attributes if psm.score is not None: peptide_hit.setScore(psm.score) if psm.rank is not None: peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based if psm.qvalue is not None: - peptide_hit.setMetaValue("q-value", psm.qvalue) + peptide_hit.setMetaValue(QVALUE_KEY, psm.qvalue) if psm.pep is not None: - peptide_hit.setMetaValue("PEP", psm.pep) + peptide_hit.setMetaValue(PEP_KEY, psm.pep) - for feature, value in psm.rescoring_features.items(): - if feature not in RESCORING_FEATURE_LIST: - # Convert numpy objects to floats since pyopenms does not support numpy objects to be added + # Add rescoring features + if psm.rescoring_features: + for feature, value in psm.rescoring_features.items(): + # Convert numpy objects to floats as pyopenms does not support numpy objects peptide_hit.setMetaValue(feature, float(value)) - def _create_new_ids(self, psm_dict: dict) -> None: - """ - Create new ProteinIdentification and PeptideIdentification objects with new features from - the PSMList. 
- """ + def _create_new_ids(self, psm_dict: dict[str | None, dict[str, dict[str, list[PSM]]]]) -> None: + """Create new ProteinIdentification and PeptideIdentification objects with new features.""" for collection, runs in psm_dict.items(): - self.protein_ids = oms.ProteinIdentification() - self.peptide_ids = [] - - # Set msrun filename with spectra_data meta value - msrun_reference = [str(run).encode() for run in runs.keys()] - self.protein_ids.setMetaValue("spectra_data", msrun_reference) - - protein_list = [] - for run, psm_dict_run in runs.items(): - for spectrum_id, psms in psm_dict_run.items(): - protein_list.append( - [accession for psm in psms for accession in psm.protein_list] - ) - - # Fill PeptideIdentification object with PeptideHits - peptide_id = oms.PeptideIdentification() - peptide_id.setMetaValue("spectrum_reference", spectrum_id) - peptide_id.setMetaValue("id_merge_index", msrun_reference.index(str(run).encode())) - if psms[0].score is not None: - peptide_id.setScoreType("search_engine_score") - if psms[0].precursor_mz is not None: - peptide_id.setMZ(psms[0].precursor_mz) - if psms[0].retention_time is not None: - peptide_id.setRT(psms[0].retention_time) - - # Fill PeptideHits object - peptide_hits = [] - for psm in psms: - peptide_hit = oms.PeptideHit() - peptide_hit.setSequence( - oms.AASequence.fromString( - self._convert_proforma_to_unimod(psm.peptidoform) - ) - ) - peptide_hit.setCharge(psm.peptidoform.precursor_charge) - peptide_hit.setMetaValue( - "target_decoy", - "" - if psm.is_decoy is None - else ("decoy" if psm.is_decoy else "target"), - ) - if psm.qvalue is not None: - peptide_hit.setMetaValue("q-value", psm.qvalue) - if psm.pep is not None: - peptide_hit.setMetaValue("PEP", psm.pep) - if psm.rank is not None: - peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based - self._add_meta_values_from_dict(peptide_hit, psm.metadata) - self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) - 
self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) - - if psm.protein_list is not None: - for protein in psm.protein_list: - peptide_evidence = oms.PeptideEvidence() - peptide_evidence.setProteinAccession(protein) - peptide_hit.addPeptideEvidence(peptide_evidence) - - peptide_hits.append(peptide_hit) - - peptide_id.setHits(peptide_hits) - self.peptide_ids.append(peptide_id) - - # Get unique protein accessions - protein_list = list( - set([accession for proteins in protein_list for accession in proteins]) - ) - protein_hits = [] - for accession in protein_list: - protein_hit = oms.ProteinHit() - protein_hit.setAccession(accession) - protein_hits.append(protein_hit) - self.protein_ids.setHits(protein_hits) - - # Write an idXML file for each collection - oms.IdXMLFile().store( - "/".join(filter(None, [collection, str(self.filename)])), - [self.protein_ids], - self.peptide_ids, + self._create_ids_for_collection(collection, runs) + + def _create_ids_for_collection( + self, collection: str | None, runs: dict[str, dict[str, list[PSM]]] + ) -> None: + """Create ProteinIdentification and PeptideIdentification objects for a single collection.""" + self.protein_ids = [oms.ProteinIdentification()] # type: ignore + self.peptide_ids = [] + + # Set msrun filename with spectra_data meta value + msrun_reference = [str(run).encode() for run in runs.keys()] + self.protein_ids[0].setMetaValue(SPECTRA_DATA_KEY, msrun_reference) + + protein_list: list[list[str]] = [] + + for run, psm_dict_run in runs.items(): + for spectrum_id, psms in psm_dict_run.items(): + # Collect protein accessions + protein_list.append( + [accession for psm in psms for accession in (psm.protein_list or [])] + ) + + # Create PeptideIdentification + peptide_id = self._create_peptide_identification( + spectrum_id, run, msrun_reference, psms + ) + + # Create PeptideHits + peptide_hits = [self._create_peptide_hit(psm) for psm in psms] + peptide_id.setHits(peptide_hits) + 
self.peptide_ids.append(peptide_id) + + # Create protein hits + self._create_protein_hits(protein_list) + + # Write idXML file + filename = "/".join(filter(None, [collection, str(self.filename)])) + oms.IdXMLFile().store(filename, self.protein_ids, self.peptide_ids) # type: ignore + + def _create_peptide_identification( + self, + spectrum_id: str, + run: str, + msrun_reference: list[bytes], + psms: list[PSM], + ) -> Any: + """Create a PeptideIdentification object for a spectrum.""" + peptide_id = oms.PeptideIdentification() # type: ignore + peptide_id.setMetaValue(SPECTRUM_REFERENCE_KEY, spectrum_id) + peptide_id.setMetaValue(ID_MERGE_INDEX_KEY, msrun_reference.index(str(run).encode())) + + # Set properties from first PSM + first_psm = psms[0] + if first_psm.score is not None: + peptide_id.setScoreType(DEFAULT_SCORE_TYPE) + if first_psm.precursor_mz is not None: + peptide_id.setMZ(first_psm.precursor_mz) + if first_psm.retention_time is not None: + peptide_id.setRT(first_psm.retention_time) + + return peptide_id + + def _create_peptide_hit(self, psm: PSM) -> Any: + """Create a PeptideHit object from a PSM.""" + peptide_hit = oms.PeptideHit() # type: ignore + + # Set sequence + peptide_hit.setSequence( + oms.AASequence.fromString( # type: ignore + self._convert_proforma_to_unimod(psm.peptidoform) ) + ) + + # Set charge + if psm.peptidoform.precursor_charge is not None: + peptide_hit.setCharge(psm.peptidoform.precursor_charge) + + # Set target/decoy information + target_decoy_value = ( + "" if psm.is_decoy is None else ("decoy" if psm.is_decoy else "target") + ) + peptide_hit.setMetaValue(TARGET_DECOY_KEY, target_decoy_value) + + # Set optional values + if psm.score is not None: + peptide_hit.setScore(psm.score) + if psm.qvalue is not None: + peptide_hit.setMetaValue(QVALUE_KEY, psm.qvalue) + if psm.pep is not None: + peptide_hit.setMetaValue(PEP_KEY, psm.pep) + if psm.rank is not None: + peptide_hit.setRank(psm.rank - 1) # 1-based to 0-based + + # Add metadata and 
features + if psm.metadata: + self._add_meta_values_from_dict(peptide_hit, psm.metadata) + if psm.provenance_data: + self._add_meta_values_from_dict(peptide_hit, psm.provenance_data) + if psm.rescoring_features: + self._add_meta_values_from_dict(peptide_hit, psm.rescoring_features) + + # Add protein evidence + if psm.protein_list is not None: + for protein in psm.protein_list: + peptide_evidence = oms.PeptideEvidence() # type: ignore + peptide_evidence.setProteinAccession(protein) + peptide_hit.addPeptideEvidence(peptide_evidence) + + return peptide_hit + + def _create_protein_hits(self, protein_list: list[list[str]]) -> None: + """Create protein hits from collected protein accessions.""" + # Get unique protein accessions + unique_proteins = list( + {accession for protein_sublist in protein_list for accession in protein_sublist} + ) + + protein_hits = [] + for accession in unique_proteins: + protein_hit = oms.ProteinHit() # type: ignore + protein_hit.setAccession(accession) + protein_hits.append(protein_hit) + + if self.protein_ids and len(self.protein_ids) > 0: + self.protein_ids[0].setHits(protein_hits) def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str: """Convert a peptidoform sequence in proforma notation to UNIMOD notation.""" @@ -546,23 +731,29 @@ def _convert_proforma_to_unimod(self, peptidoform: Peptidoform) -> str: return sequence - def _add_meta_values_from_dict(self, peptide_hit: oms.PeptideHit, d: dict) -> None: + def _add_meta_values_from_dict(self, peptide_hit: Any, d: dict[str, Any] | None) -> None: """Add meta values inplace to :py:class:`~pyopenms.PeptideHit` from a dictionary.""" - if d is not None: - for key, value in d.items(): - # Convert numpy objects to floats since pyopenms does not support numpy objects to be added - if not isinstance(value, str): + if d is None: + return + + for key, value in d.items(): + # Convert numpy objects to floats since pyopenms does not support numpy objects + if not isinstance(value, str): + 
try: value = float(value) - peptide_hit.setMetaValue(key, value) + except (ValueError, TypeError): + # Skip values that cannot be converted + continue + peptide_hit.setMetaValue(key, value) -class IdXMLException(PSMUtilsException): - """Exception in psm_utils.io.IdXML""" +class IdXMLException(PSMUtilsIOException): + """Exception in psm_utils.io.IdXML.""" pass -class IdXMLReaderEmptyListException(PSMUtilsException): - """Exception in psm_utils.io.IdXMLReader""" +class IdXMLReaderEmptyListException(PSMUtilsIOException): + """Exception in psm_utils.io.IdXMLReader.""" pass diff --git a/psm_utils/io/ionbot.py b/psm_utils/io/ionbot.py index 43ad511..79b0e9b 100644 --- a/psm_utils/io/ionbot.py +++ b/psm_utils/io/ionbot.py @@ -8,15 +8,14 @@ import csv import re +from collections.abc import Iterator from pathlib import Path -from typing import Dict, Iterable, Union from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.peptidoform import Peptidoform from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList -from psm_utils.io._utils import set_csv_field_size_limit set_csv_field_size_limit() @@ -36,6 +35,8 @@ class IonbotReader(ReaderBase): + """Reader for ionbot PSM files.""" + def __init__( self, filename: str | Path, @@ -47,12 +48,15 @@ def __init__( Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. 
Examples -------- - IonbotReader supports iteration: >>> from psm_utils.io.ionbot import IonbotReader @@ -70,68 +74,115 @@ def __init__( """ super().__init__(filename, *args, **kwargs) - self.filename = filename - def __iter__(self) -> Iterable[PSM]: - """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "rt") as open_file: + def __iter__(self) -> Iterator[PSM]: + """ + Iterate over file and return PSMs one-by-one. + + Yields + ------ + PSM + Individual PSM objects from the ionbot CSV file. + + Raises + ------ + FileNotFoundError + If the specified file does not exist. + csv.Error + If there are issues reading the CSV file. + InvalidIonbotModificationError + If modification parsing fails. + + """ + with open(self.filename) as open_file: reader = csv.DictReader(open_file, delimiter=",") for row in reader: yield self._get_peptide_spectrum_match(row) - def read_file(self) -> PSMList: - """Read full PSM file into a PSMList object.""" - return PSMList(psm_list=[psm for psm in self]) - - def _get_peptide_spectrum_match(self, psm_dict: Dict[str, str | float]) -> PSM: - return PSM( - peptidoform=self._parse_peptidoform( - psm_dict["matched_peptide"], - psm_dict["modifications"], - psm_dict["charge"], - ), - spectrum_id=psm_dict["spectrum_title"], - run=psm_dict["spectrum_file"], - is_decoy=( - True - if psm_dict["database"] == "D" - else False if psm_dict["database"] == "T" else None - ), - score=float(psm_dict["psm_score"]), - precursor_mz=float(psm_dict["m/z"]), - retention_time=float(psm_dict["observed_retention_time"]), - protein_list=psm_dict["proteins"].split("||"), - source="ionbot", - qvalue=float(psm_dict["q-value"]), - pep=float(psm_dict["PEP"]), - provenance_data=({"ionbot_filename": str(self.filename)}), - metadata={ - col: str(psm_dict[col]) for col in psm_dict.keys() if col not in REQUIRED_COLUMNS - }, - ) + def _get_peptide_spectrum_match(self, psm_dict: dict[str, str]) -> PSM: + """Convert a dictionary row from ionbot CSV to a PSM 
object.""" + try: + return PSM( + peptidoform=self._parse_peptidoform( + psm_dict["matched_peptide"], + psm_dict["modifications"], + psm_dict["charge"], + ), + spectrum_id=psm_dict["spectrum_title"], + run=psm_dict["spectrum_file"], + is_decoy=( + True + if psm_dict["database"] == "D" + else False + if psm_dict["database"] == "T" + else None + ), + score=float(psm_dict["psm_score"]), + precursor_mz=float(psm_dict["m/z"]), + retention_time=float(psm_dict["observed_retention_time"]), + protein_list=psm_dict["proteins"].split("||"), + source="ionbot", + qvalue=float(psm_dict["q-value"]), + pep=float(psm_dict["PEP"]), + provenance_data={"ionbot_filename": str(self.filename)}, + metadata={ + col: str(psm_dict[col]) + for col in psm_dict.keys() + if col not in REQUIRED_COLUMNS + }, + ) + except KeyError as e: + raise PSMUtilsIOException(f"Missing required column in ionbot file: {e}") from e + except ValueError as e: + raise PSMUtilsIOException(f"Error parsing numeric value in ionbot file: {e}") from e @staticmethod - def _parse_peptidoform( - peptide: str, modifications: str, charge: Union[str, int] - ) -> Peptidoform: + def _parse_peptidoform(peptide: str, modifications: str, charge: str | int) -> Peptidoform: """Parse peptide, modifications, and charge to Peptidoform.""" # Split peptide into list of amino acids with termini - peptide = peptide = [""] + list(peptide) + [""] + peptide_list: list[str] = [""] + list(peptide) + [""] # Add modifications - pattern = re.compile(r"^(?P\[\S*?\])?(?P.*?)(?P\[\S*?\])?$") - for position, label in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): - mod_match = pattern.search(label) - if mod_match.group("U"): - parsed_label = "U:" + mod_match.group("U")[1:-1] - else: - parsed_label = mod_match.group("mod") - peptide[int(position)] += f"[{parsed_label}]" + pattern: re.Pattern[str] = re.compile(r"^(?P\[\S*?\])?(?P.*?)(?P\[\S*?\])?$") + + if modifications: # Handle empty modifications string + mod_parts = 
modifications.split("|") + if len(mod_parts) % 2 != 0: + raise InvalidIonbotModificationError( + f"Invalid modification string format: '{modifications}'. " + "Expected even number of parts (position|label pairs)." + ) + + for position_str, label in zip(mod_parts[::2], mod_parts[1::2]): + mod_match = pattern.search(label) + if not mod_match: + raise InvalidIonbotModificationError( + f"Invalid modification format '{label}' at position {position_str} in " + f"'{modifications}'." + ) + + try: + position = int(position_str) + except ValueError as e: + raise InvalidIonbotModificationError( + f"Invalid position '{position_str}' in modifications '{modifications}'" + ) from e + + if position < 0 or position >= len(peptide_list): + raise InvalidIonbotModificationError( + f"Position {position} out of range for peptide '{peptide}' (length {len(peptide_list) - 2})" + ) + + if mod_match.group("U"): + parsed_label = "U:" + mod_match.group("U")[1:-1] + else: + parsed_label = mod_match.group("mod") + peptide_list[position] += f"[{parsed_label}]" # Add terminal modifications - peptide[0] = peptide[0] + "-" if peptide[0] else "" - peptide[-1] = "-" + peptide[-1] if peptide[-1] else "" - proforma_seq = "".join(peptide) + peptide_list[0] = peptide_list[0] + "-" if peptide_list[0] else "" + peptide_list[-1] = "-" + peptide_list[-1] if peptide_list[-1] else "" + proforma_seq = "".join(peptide_list) # Add charge state proforma_seq += f"/{charge}" @@ -140,6 +191,13 @@ def _parse_peptidoform( class InvalidIonbotModificationError(PSMUtilsIOException): - """Invalid Peptide Record modification.""" + """ + Exception raised when ionbot modification parsing fails. 
+ + This exception is raised when: + - Modification format is invalid + - Position values are out of range + - Modification string structure is malformed + """ pass diff --git a/psm_utils/io/maxquant.py b/psm_utils/io/maxquant.py index f51aaf6..46f8907 100644 --- a/psm_utils/io/maxquant.py +++ b/psm_utils/io/maxquant.py @@ -5,6 +5,7 @@ import csv import logging import re +from collections.abc import Iterator, Sequence from itertools import compress from pathlib import Path @@ -12,9 +13,9 @@ from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase +from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.peptidoform import Peptidoform from psm_utils.psm import PSM -from psm_utils.io._utils import set_csv_field_size_limit set_csv_field_size_limit() @@ -44,19 +45,20 @@ def __init__( **kwargs, ) -> None: """ - Reader for MaxQuant msms.txt PSM files. + Initialize reader for MaxQuant msms.txt PSM files. Parameters ---------- - filename: str, pathlib.Path - Path to PSM file. - decoy_prefix: str, optional - Protein name prefix used to denote decoy protein entries. Default: - ``"DECOY_"``. + filename + Path to the MaxQuant msms.txt PSM file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. Examples -------- - :py:class:`MSMSReader` supports iteration: + MSMSReader supports iteration: >>> from psm_utils.io.maxquant import MSMSReader >>> for psm in MSMSReader("msms.txt"): @@ -66,20 +68,18 @@ def __init__( GANLGEMTNAGIPVPPGFC[+57.022]VTAEAYK ... 
- Or a full file can be read at once into a - :py:class:`~psm_utils.psm_list.PSMList` object: + Or a full file can be read at once into a :py:class:`~psm_utils.psm_list.PSMList` + object: >>> reader = MSMSReader("msms.txt") >>> psm_list = reader.read_file() """ - super().__init__(filename, *args, **kwargs) - self._validate_msms() - def __iter__(self): - """Iterate over file and return PSMs one-by-one""" + def __iter__(self) -> Iterator[PSM]: + """Iterate over file and return PSMs one-by-one.""" with open(self.filename) as msms_in: reader = csv.DictReader(msms_in, delimiter="\t") for psm_dict in reader: @@ -87,13 +87,16 @@ def __iter__(self): yield psm def _validate_msms(self) -> None: - with open(self.filename, "r") as msms_file: + """Validate that the msms.txt file contains required columns.""" + with open(self.filename) as msms_file: msms_reader = csv.DictReader(msms_file, delimiter="\t") self._evaluate_columns(msms_reader.fieldnames) @staticmethod - def _evaluate_columns(columns) -> bool: - """Case insensitive column evaluation msms file.""" + def _evaluate_columns(columns: Sequence[str] | None) -> None: + """Case insensitive column evaluation for msms file.""" + if columns is None: + raise MSMSParsingError("MSMS file does not contain any columns.") columns = list(map(lambda col: col.lower(), columns)) column_check = [True if col.lower() in columns else False for col in MSMS_REQUIRED_COLUMNS] if not all(column_check): @@ -101,11 +104,12 @@ def _evaluate_columns(columns) -> bool: f"Missing columns: {list(compress(MSMS_REQUIRED_COLUMNS, list(~np.array(column_check))))}" ) - def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, str]) -> PSM: """Return a PSM object from MaxQuant msms.txt PSM file.""" - psm = PSM( - peptidoform=self._parse_peptidoform(psm_dict["Modified sequence"], psm_dict["Charge"]), + peptidoform=self._parse_peptidoform( + psm_dict["Modified sequence"], 
int(psm_dict["Charge"]) + ), spectrum_id=psm_dict["Scan number"], run=psm_dict["Raw file"], is_decoy=psm_dict["Reverse"] == "+", @@ -126,8 +130,7 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: @staticmethod def _parse_peptidoform(modified_seq: str, charge: int) -> Peptidoform: - """Parse modified sequence to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" - + """Parse modified sequence to Peptidoform.""" # pattern to match open and closed round brackets pattern = re.compile(r"\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)") modified_seq = modified_seq.strip("_") diff --git a/psm_utils/io/msamanda.py b/psm_utils/io/msamanda.py index 67d9ccb..d329ac3 100644 --- a/psm_utils/io/msamanda.py +++ b/psm_utils/io/msamanda.py @@ -5,19 +5,20 @@ import csv import logging import re +from collections.abc import Iterator, Sequence from itertools import compress from pathlib import Path import numpy as np -from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase -from psm_utils.psm import PSM, Peptidoform from psm_utils.io._utils import set_csv_field_size_limit +from psm_utils.io.exceptions import PSMUtilsIOException +from psm_utils.psm import PSM, Peptidoform set_csv_field_size_limit() -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) # Minimal set of required columns @@ -49,18 +50,31 @@ class MSAmandaReader(ReaderBase): - """Reader for psm_utils TSV format.""" + """Reader for MS Amanda CSV result files.""" def __init__(self, filename: str | Path, *args, **kwargs) -> None: + """ + Initialize reader for MS Amanda CSV result files. + + Parameters + ---------- + filename + Path to the MS Amanda CSV file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. 
+ + """ super().__init__(filename, *args, **kwargs) - self._present_columns = REQUIRED_COLUMNS.copy() - self._rescoring_feature_columns = [] - self._metadata_columns = [] - self._has_rank_column = None + self._present_columns: list[str] = REQUIRED_COLUMNS.copy() + self._rescoring_feature_columns: list[str] = [] + self._metadata_columns: list[str] = [] + self._has_rank_column: bool | None = None - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "rt") as open_file: + with open(self.filename) as open_file: if not next(open_file).startswith("#"): open_file.seek(0) reader = csv.DictReader(open_file, delimiter="\t") @@ -68,8 +82,11 @@ def __iter__(self): for psm_dict in reader: yield self._get_peptide_spectrum_match(psm_dict) - def _evaluate_columns(self, columns) -> bool: - """Column evaluation for MS Amanda file.""" + def _evaluate_columns(self, columns: Sequence[str] | None) -> None: + """Evaluate and validate columns from MS Amanda file header.""" + if columns is None: + raise MSAmandaParsingError("MS Amanda file does not contain any columns.") + # Check if required columns are present column_check = [True if col in columns else False for col in REQUIRED_COLUMNS] if not all(column_check): @@ -91,7 +108,7 @@ def _evaluate_columns(self, columns) -> bool: if col not in self._present_columns + self._rescoring_feature_columns ] - def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, str]) -> PSM: """Return a PSM object from MS Amanda CSV PSM file.""" psm = PSM( peptidoform=self._parse_peptidoform( @@ -120,8 +137,8 @@ def _get_peptide_spectrum_match(self, psm_dict: dict[str, str | float]) -> PSM: return psm @staticmethod - def _parse_peptidoform(seq, modifications, charge): - "Parse MSAmanda sequence, modifications and charge to proforma sequence" + def _parse_peptidoform(seq: str, modifications: 
str, charge: str) -> Peptidoform: + """Parse MSAmanda sequence, modifications and charge to ProForma sequence.""" peptide = [""] + [aa.upper() for aa in seq] + [""] pattern = re.compile( r"(?:(?:(?P[A-Z])(?P\d+))|(?P[CN]-Term))\((?P[^|()]+)\|(?P[-0-9.]+)\|(?Pvariable|fixed)\);?" @@ -129,12 +146,12 @@ def _parse_peptidoform(seq, modifications, charge): for match in pattern.finditer(modifications): if match.group("term") == "N-Term": - peptide[0] = peptide[0] + f'[{match.group("mod_name")}]' + peptide[0] = peptide[0] + f"[{match.group('mod_name')}]" elif match.group("term") == "C-Term": - peptide[-1] = peptide[-1] + f'[{match.group("mod_name")}]' + peptide[-1] = peptide[-1] + f"[{match.group('mod_name')}]" if match.group("loc") is not None: peptide[int(match.group("loc"))] = ( - peptide[int(match.group("loc"))] + f'[{match.group("mod_name")}]' + peptide[int(match.group("loc"))] + f"[{match.group('mod_name')}]" ) peptide[0] = peptide[0] + "-" if peptide[0] else "" @@ -144,7 +161,7 @@ def _parse_peptidoform(seq, modifications, charge): return Peptidoform(proforma_seq + f"/{charge}") -class MSAmandaParsingError(PSMUtilsException): - """Error while parsing MS Amanda CSV PSM file.""" +class MSAmandaParsingError(PSMUtilsIOException): + """Error in parsing MS Amanda CSV file.""" pass diff --git a/psm_utils/io/mzid.py b/psm_utils/io/mzid.py index e10883b..9a5ca50 100644 --- a/psm_utils/io/mzid.py +++ b/psm_utils/io/mzid.py @@ -1,5 +1,5 @@ """ -Reader and writers for the HUPO-PSI mzIdentML format. +Reader and writer for HUPO-PSI mzIdentML format PSM files. See `psidev.info/mzidentml `_ for more info on the format. 
@@ -11,11 +11,12 @@ import logging import re import xml.etree.ElementTree as ET +from collections.abc import Iterator from pathlib import Path -from typing import Union +from typing import Any, cast -from psims.mzid import MzIdentMLWriter -from pyteomics import mzid, proforma +from psims.mzid import MzIdentMLWriter # type: ignore[import] +from pyteomics import mzid # type: ignore[import] from rich.progress import Progress from psm_utils import __version__ @@ -89,21 +90,28 @@ class MzidReader(ReaderBase): - def __init__(self, filename: str | Path, *args, score_key: str = None, **kwargs) -> None: + """Reader for HUPO-PSI mzIdentML format PSM files.""" + + def __init__( + self, filename: str | Path, *args: Any, score_key: str | None = None, **kwargs: Any + ) -> None: """ - Reader for mzIdentML PSM files. + Reader for HUPO-PSI mzIdentML format PSM files. Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. - score_key: str, optional + *args + Additional positional arguments passed to parent class. + score_key Name of the score metric to use as PSM score. If not provided, the score metric is inferred from the file if one of the child parameters of ``MS:1001143`` is present. + **kwargs + Additional keyword arguments passed to parent class. 
Examples -------- - MzidReader supports iteration: >>> from psm_utils.io.mzid import MzidReader @@ -134,31 +142,39 @@ def __init__(self, filename: str | Path, *args, score_key: str = None, **kwargs) super().__init__(filename, *args, **kwargs) self._non_metadata_keys = ["ContactRole", "passThreshold"] self._score_key = score_key - self._rt_key = None - self._spectrum_rt_key = None - self._qvalue_key = None - self._pep_key = None - self._im_key = None + self._rt_key: str | None = None + self._spectrum_rt_key: str | None = None + self._qvalue_key: str | None = None + self._pep_key: str | None = None + self._im_key: str | None = None self._source = self._infer_source() - def __iter__(self): - """Iterate over file and return PSMs one-by-one.""" - with mzid.read(str(self.filename)) as reader: + def __iter__(self) -> Iterator[PSM]: + """Iterate over mzIdentML file and return PSMs one-by-one.""" + with mzid.MzIdentML(str(self.filename)) as reader: + first_entry = next(reader) # Parse spectrum metadata - self._get_toplevel_non_metadata_keys(reader[0].keys()) + self._get_toplevel_non_metadata_keys(first_entry.keys()) # Parse PSM non-metadata keys, rt key and score key - self._get_non_metadata_keys(reader[0]["SpectrumIdentificationItem"][0].keys()) + self._get_non_metadata_keys(first_entry["SpectrumIdentificationItem"][0].keys()) + with mzid.MzIdentML(str(self.filename)) as reader: for spectrum in reader: # Parse spectrum metadata spectrum_id = spectrum["spectrumID"] - spectrum_title = ( - spectrum["spectrum title"] if "spectrum title" in spectrum else None + spectrum_title = spectrum.get("spectrum title") + run = Path(spectrum["location"]).stem if spectrum.get("location") else None + rt = ( + float(spectrum[self._spectrum_rt_key]) + if self._spectrum_rt_key and self._spectrum_rt_key in spectrum + else None + ) + ion_mobility = ( + float(spectrum[self._im_key]) + if self._im_key and self._im_key in spectrum + else None ) - run = Path(spectrum["location"]).stem if "location" 
in spectrum else None - rt = float(spectrum[self._spectrum_rt_key]) if self._spectrum_rt_key else None - ion_mobility = float(spectrum[self._im_key]) if self._im_key else None # Parse PSMs from spectrum for entry in spectrum["SpectrumIdentificationItem"]: @@ -167,24 +183,29 @@ def __iter__(self): ) @staticmethod - def _get_xml_namespace(root_tag): - """Get the namespace of the xml root.""" + def _get_xml_namespace(root_tag: str) -> str: + """Extract XML namespace from root tag.""" m = re.match(r"\{.*\}", root_tag) return m.group(0) if m else "" - def _infer_source(self): - """Get the source of the mzid file.""" + def _infer_source(self) -> str | None: + """Infer search engine source from mzIdentML file metadata.""" mzid_xml = ET.parse(self.filename) root = mzid_xml.getroot() name_space = self._get_xml_namespace(root.tag) + software = root.find(f".//{name_space}AnalysisSoftware") + if software is None: + return None try: - return root.find(f".//{name_space}AnalysisSoftware").attrib["name"] + return software.attrib["name"] except KeyError: return None @staticmethod - def _parse_peptidoform(seq: str, modification_list: list[dict], charge: Union[int, None]): - """Parse mzid sequence and modifications to Peptidoform.""" + def _parse_peptidoform( + seq: str, modification_list: list[dict[str, Any]], charge: int | None + ) -> Peptidoform: + """Parse mzIdentML sequence and modifications into Peptidoform object.""" peptide = [""] + list(seq) + [""] # Add modification labels @@ -203,51 +224,47 @@ def _parse_peptidoform(seq: str, modification_list: list[dict], charge: Union[in return Peptidoform(proforma_seq) @staticmethod - def _parse_peptide_evidence_ref(peptide_evidence_list: list[dict]): + def _parse_peptide_evidence_ref( + peptide_evidence_list: list[dict[str, Any]], + ) -> tuple[bool, list[str]]: """ - Parse PeptideEvidence list of PSM. 
- - Notes - ----- - If multiple PeptideEvidence entries are associated with the PSM, the PSM is only considered - a decoy entry if ALL PeptideEvidence entries are decoy entries. If a target PeptideEvidence - entry is present, it should get priority over decoy entries. In theory, no overlap between - target and decoy peptide sequence should be present in the search space, although this - might not have been filtered for by the search engine. + Parse PeptideEvidence list to determine decoy status and protein accessions. + PSM is considered decoy only if ALL PeptideEvidence entries are decoy. """ - isdecoy = all( - [entry["isDecoy"] if "isDecoy" in entry else None for entry in peptide_evidence_list] - ) - protein_list = [d["accession"] for d in peptide_evidence_list if "accession" in d.keys()] - return isdecoy, protein_list + is_decoy = all(entry.get("isDecoy", False) for entry in peptide_evidence_list) + protein_list = [ + accession + for d in peptide_evidence_list + if (accession := d.get("accession")) is not None + ] + return is_decoy, protein_list def _get_peptide_spectrum_match( self, spectrum_id: str, - spectrum_title: Union[str, None], - run: Union[str, None], - rt: Union[float, None], - ion_mobility: Union[float, None], - spectrum_identification_item: dict[str, str | float | list], + spectrum_title: str | None, + run: str | None, + rt: float | None, + ion_mobility: float | None, + spectrum_identification_item: dict[str, Any], ) -> PSM: - """Parse single mzid entry to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" + """Parse single mzIdentML SpectrumIdentificationItem into PSM object.""" sii = spectrum_identification_item - try: - modifications = sii["Modification"] - except KeyError: - modifications = [] - sequence = sii["PeptideSequence"] - charge = sii["chargeState"] if "chargeState" in sii else None + modifications = cast(list[dict[str, Any]], sii.get("Modification", [])) + sequence = cast(str, sii["PeptideSequence"]) + charge_value = 
sii.get("chargeState") + charge = int(charge_value) if charge_value is not None else None peptidoform = self._parse_peptidoform(sequence, modifications, charge) - is_decoy, protein_list = self._parse_peptide_evidence_ref(sii["PeptideEvidenceRef"]) - try: - precursor_mz = sii["experimentalMassToCharge"] - except KeyError: - precursor_mz = None + is_decoy, protein_list = self._parse_peptide_evidence_ref( + cast(list[dict[str, Any]], sii["PeptideEvidenceRef"]) + ) + + precursor_mz_value = sii.get("experimentalMassToCharge") + precursor_mz = float(precursor_mz_value) if precursor_mz_value is not None else None # Override spectrum-level RT if present at PSM level - if self._rt_key: + if self._rt_key and self._rt_key in sii: rt = float(sii[self._rt_key]) metadata = {col: str(sii[col]) for col in sii.keys() if col not in self._non_metadata_keys} @@ -259,30 +276,45 @@ def _get_peptide_spectrum_match( else: psm_spectrum_id = spectrum_id - try: - score = sii[self._score_key] - except KeyError: - score = None + score = None + if self._score_key: + score_value = sii.get(self._score_key) + score = float(score_value) if score_value is not None else None + + # Calculate qvalue and pep with cleaner logic + qvalue = None + if self._qvalue_key: + qvalue_raw = sii.get(self._qvalue_key) + qvalue = float(qvalue_raw) if qvalue_raw is not None else None + + pep = None + if self._pep_key: + pep_raw = sii.get(self._pep_key) + pep = float(pep_raw) if pep_raw is not None else None + + rank_value = sii.get("rank") + rank = int(rank_value) if rank_value is not None else None + psm = PSM( peptidoform=peptidoform, spectrum_id=psm_spectrum_id, run=run, is_decoy=is_decoy, score=score, - qvalue=sii[self._qvalue_key] if self._qvalue_key else None, - pep=sii[self._pep_key] if self._pep_key else None, + qvalue=qvalue, + pep=pep, precursor_mz=precursor_mz, retention_time=rt, ion_mobility=ion_mobility, protein_list=protein_list, - rank=sii["rank"] if "rank" in sii else None, + rank=rank, 
source=self._source, provenance_data={"mzid_filename": str(self.filename)}, metadata=metadata, ) return psm - def _get_non_metadata_keys(self, keys: list): + def _get_non_metadata_keys(self, keys: list[str]) -> None: """Gather all the keys at PSM-level that should not be written to metadata.""" # All keys required to create PSM object default_keys = [ @@ -323,8 +355,8 @@ def _get_non_metadata_keys(self, keys: list): # Keys that are not necessary for metadata self._non_metadata_keys.extend(default_keys) - def _get_toplevel_non_metadata_keys(self, keys: list): - """Gather all keys at spectrum-level that should not be written to metadata.""" + def _get_toplevel_non_metadata_keys(self, keys: list[str]) -> None: + """Identify spectrum-level keys that should not be written to PSM metadata.""" # Check if RT is encoded in spectrum metadata for key in ["retention time", "scan start time"]: if key in keys: @@ -340,30 +372,29 @@ def _get_toplevel_non_metadata_keys(self, keys: list): break @staticmethod - def _infer_score_name(keys) -> str: - """Infer the score from the list of known PSM scores.""" + def _infer_score_name(keys: list[str]) -> str | None: + """Infer search engine score name from available PSM keys.""" lower_keys = {key.lower(): key for key in keys} for score in STANDARD_SEARCHENGINE_SCORES: if score in lower_keys: return lower_keys[score] + return None @staticmethod - def _infer_qvalue_name(keys) -> Union[str, None]: - """Infer the q-value term from the list of known terms.""" + def _infer_qvalue_name(keys: list[str]) -> str | None: + """Infer q-value field name from available PSM keys.""" for qvalue in Q_VALUE_TERMS: if qvalue in keys: return qvalue - else: - return None + return None @staticmethod - def _infer_pep_name(keys) -> Union[str, None]: - """Infer the PEP term from the list of known terms.""" + def _infer_pep_name(keys: list[str]) -> str | None: + """Infer PEP (Posterior Error Probability) field name from available PSM keys.""" for pep in PEP_TERMS: 
if pep in keys: return pep - else: - return None + return None class MzidWriter(WriterBase): @@ -372,10 +403,10 @@ class MzidWriter(WriterBase): def __init__( self, filename: str | Path, - *args, + *args: Any, show_progressbar: bool = False, - **kwargs, - ): + **kwargs: Any, + ) -> None: """ Writer for mzIdentML PSM files. @@ -383,8 +414,12 @@ def __init__( ---------- filename: str, Pathlib.Path Path to PSM file. + *args + Additional positional argument passed to parent class. show_progressbar: bool, optional Show progress bar for conversion process. (default: False) + **kwargs + Additional keyword arguments passed to parent class. Notes ----- @@ -404,9 +439,11 @@ def __init__( self._writer = None def __enter__(self) -> MzidWriter: + """Open file for writing and return self.""" return self def __exit__(self, *args, **kwargs) -> None: + """Close file and writer.""" pass def write_psm(self, psm: PSM): @@ -423,8 +460,8 @@ def write_psm(self, psm: PSM): """ raise NotImplementedError("MzidWriter currently does not support write_psm.") - def write_file(self, psm_list: PSMList): - """Write entire PSMList to mzid file.""" + def write_file(self, psm_list: PSMList) -> None: + """Write entire PSMList to mzIdentML file.""" file = open(self.filename, "wb") with Progress(disable=not self.show_progressbar) as progress: with MzIdentMLWriter(file, close=True) as writer: @@ -526,10 +563,12 @@ def write_file(self, psm_list: PSMList): ) @staticmethod - def _create_peptide_object(peptidoform): + def _create_peptide_object(peptidoform: Peptidoform) -> dict[str, Any]: """Create mzid peptide object from Peptidoform.""" - def parse_modifications(modifications: list[proforma.TagBase], location: int): + def parse_modifications( + modifications: list[Any] | None, location: int + ) -> list[dict[str, Any]]: modification_list = [] if modifications: for mod in modifications: @@ -537,15 +576,15 @@ def parse_modifications(modifications: list[proforma.TagBase], location: int): 
modification_list.append( { "location": location, - "name": mod.name, - "monoisotopic_mass_delta": mod.mass, + "name": mod.name, # type: ignore[attr-defined] + "monoisotopic_mass_delta": mod.mass, # type: ignore[attr-defined] } ) except AttributeError: modification_list.append( { "location": location, - "monoisotopic_mass_delta": mod.mass, + "monoisotopic_mass_delta": mod.mass, # type: ignore[attr-defined] } ) return modification_list @@ -554,9 +593,11 @@ def parse_modifications(modifications: list[proforma.TagBase], location: int): modifications = [] for loc, (aa, mods) in enumerate(peptidoform.parsed_sequence, start=1): modifications.extend(parse_modifications(mods, loc)) - modifications.extend(parse_modifications(peptidoform.properties["n_term"], 0)) + modifications.extend(parse_modifications(peptidoform.properties.get("n_term"), 0)) modifications.extend( - parse_modifications(peptidoform.properties["c_term"], len(peptidoform.sequence) + 1) + parse_modifications( + peptidoform.properties.get("c_term"), len(peptidoform.sequence) + 1 + ) ) peptide_object = { @@ -567,7 +608,7 @@ def parse_modifications(modifications: list[proforma.TagBase], location: int): return peptide_object - def _transform_search_database(self): + def _transform_search_database(self) -> dict[str, Any]: """Create mzid database object.""" # TODO: Create this and link with protein object when fasta file is provided return { @@ -579,15 +620,17 @@ def _transform_search_database(self): } @staticmethod - def _transform_spectra_data(spec_id_dict: dict): - """Get all the unique spectra data from PSMList spectrum id dict.""" - collection_run_id_dict = {} + def _transform_spectra_data( + spec_id_dict: dict[str, Any], + ) -> tuple[list[dict[str, Any]], dict[str, int]]: + """Get all unique spectra data from PSMList spectrum id dict.""" + collection_run_id_dict: dict[str, int] = {} spectra_data = [] i = 1 - for collection in spec_id_dict.keys(): - for run in spec_id_dict[collection].keys(): + for collection 
in spec_id_dict: + for run in spec_id_dict[collection]: collection_run_id = "/".join(filter(None, [collection, run])) - if collection_run_id not in collection_run_id_dict.keys(): + if collection_run_id not in collection_run_id_dict: collection_run_id_dict[collection_run_id] = i spectra_data_object = { "id": i, @@ -595,28 +638,30 @@ def _transform_spectra_data(spec_id_dict: dict): "spectrum_id_format": "multiple peak list nativeID format", # 'file_format': #TODO can we infer this? } - spectra_data.append(spectra_data_object) + spectra_data.append(spectra_data_object) + i += 1 return spectra_data, collection_run_id_dict @staticmethod - def _transform_spectrum_identification_item(candidate_psm): + def _transform_spectrum_identification_item( + candidate_psm: dict[str, Any], + ) -> list[dict[str, Any]]: """Create SpectrumIdentificationItem for each candidate PSM.""" peptide = candidate_psm["peptidoform"].proforma - if candidate_psm["metadata"]: - params = [{k: v} for k, v in candidate_psm["metadata"].items()] - else: - params = [] + params = list(candidate_psm["metadata"].items()) if candidate_psm.get("metadata") else [] + params = [{k: v} for k, v in params] for key, label, unit in [ ("retention_time", "retention time", "second"), ("qvalue", "PSM-level q-value", None), ("pep", "PSM-level local FDR", None), ]: - if candidate_psm[key]: + value = candidate_psm.get(key) + if value is not None: + param = {"name": label, "value": value} if unit: - params.append({"name": label, "value": candidate_psm[key], "unit_name": unit}) - else: - params.append({"name": label, "value": candidate_psm[key]}) + param["unit_name"] = unit + params.append(param) candidate_psm_dict = { "charge_state": candidate_psm["peptidoform"].precursor_charge, @@ -637,8 +682,10 @@ def _transform_spectrum_identification_item(candidate_psm): items.append(dict(candidate_psm_dict, **protein_specific_items)) return items - def _transform_spectrum_identification_result(self, spec_id, identified_psms, 
spectra_data_id): - """Create mzid SpectrumIdentificationResult object for PSMs that match the same spectrum.""" + def _transform_spectrum_identification_result( + self, spec_id: str, identified_psms: list[dict[str, Any]], spectra_data_id: int + ) -> dict[str, Any]: + """Create mzid SpectrumIdentificationResult object for spectrum PSMs.""" spectrum_id_result = { "id": f"SIR_{spec_id}", "spectrum_id": spec_id, diff --git a/psm_utils/io/parquet.py b/psm_utils/io/parquet.py index 755cfcc..f53bac1 100644 --- a/psm_utils/io/parquet.py +++ b/psm_utils/io/parquet.py @@ -9,11 +9,11 @@ from __future__ import annotations +from collections.abc import Iterator from pathlib import Path -from typing import Union -import pyarrow as pa -import pyarrow.parquet as pq +import pyarrow as pa # type: ignore[import-untyped] +import pyarrow.parquet as pq # type: ignore[import-untyped] from pydantic import ValidationError from psm_utils.io._base_classes import ReaderBase, WriterBase @@ -23,20 +23,27 @@ class ParquetReader(ReaderBase): - def __init__(self, path: Union[str, Path], *args, **kwargs): + """Reader for Parquet files.""" + + def __init__(self, filename: str | Path, *args, **kwargs) -> None: """ Reader for Parquet files. Parameters ---------- - path : Union[str, Path] + filename Path to the Parquet file. + *args + Additional positional arguments passed to the base class. + **kwargs + Additional keyword arguments passed to the base class. 
""" - self.path = path + super().__init__(filename, *args, **kwargs) - def __iter__(self): - with pq.ParquetFile(self.path) as reader: + def __iter__(self) -> Iterator[PSM]: + """Iterate over the Parquet file and return PSMs one-by-one.""" + with pq.ParquetFile(self.filename) as reader: for batch in reader.iter_batches(): for row in batch.to_pylist(): # Convert map columns (rendered as lists of tuples) to dictionaries @@ -52,39 +59,49 @@ def __iter__(self): class ParquetWriter(WriterBase): - def __init__(self, path: Union[str, Path], chunk_size: int = 1e6, *args, **kwargs): + """Writer for Parquet files.""" + + def __init__(self, filename: str | Path, *args, chunk_size: int = int(1e6), **kwargs): """ Writer for Parquet files. Parameters ---------- - path : Union[str, Path] + filename Path to the Parquet file. - chunk_size : int + *args + Additional positional arguments passed to the base class. + chunk_size Number of PSMs to write in a single batch. Default is 1e6. + **kwargs + Additional keyword arguments passed to the base class. 
""" - self.path = path + super().__init__(filename, *args, **kwargs) + self.chunk_size = chunk_size - self._writer = None - self._psm_cache = [] + self._writer: pq.ParquetWriter | None = None + self._psm_cache: list[dict] = [] - def __enter__(self): - self._writer = pq.ParquetWriter(self.path, schema=SCHEMA) + def __enter__(self) -> ParquetWriter: + """Open the Parquet writer in a context manager.""" + self._writer = pq.ParquetWriter(self.filename, schema=SCHEMA) return self - def __exit__(self, *args, **kwargs): - self._flush() - self._writer.close() + def __exit__(self, *args, **kwargs) -> None: + """Close the Parquet writer.""" + if self._writer is not None: + self._flush() + self._writer.close() - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """Write a single PSM to the Parquet file.""" self._psm_cache.append(self._psm_to_entry(psm)) if len(self._psm_cache) > self.chunk_size: self._flush() - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """Write a list of PSMs to the Parquet file.""" with self: for psm in psm_list: @@ -97,10 +114,12 @@ def _psm_to_entry(psm: PSM) -> dict: psm_dict["peptidoform"] = str(psm.peptidoform) return psm_dict - def _flush(self): + def _flush(self) -> None: """Write the cached PSMs to the Parquet file.""" if not self._psm_cache: return + if self._writer is None: + raise PSMUtilsIOException("ParquetWriter must be opened in a context manager.") table = pa.Table.from_pylist(self._psm_cache, schema=SCHEMA) self._writer.write_table(table) self._psm_cache = [] diff --git a/psm_utils/io/peptide_record.py b/psm_utils/io/peptide_record.py index 8afb97f..749b3f9 100644 --- a/psm_utils/io/peptide_record.py +++ b/psm_utils/io/peptide_record.py @@ -55,113 +55,78 @@ from __future__ import annotations import csv -from collections import namedtuple +from collections.abc import Iterator from pathlib import Path -from typing import Iterable, NamedTuple, Optional +from typing import 
Any, TextIO import pandas as pd +from pydantic import BaseModel, ConfigDict from psm_utils.io._base_classes import ReaderBase, WriterBase +from psm_utils.io._utils import set_csv_field_size_limit from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.peptidoform import Peptidoform from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.io._utils import set_csv_field_size_limit set_csv_field_size_limit() -class _PeptideRecord: - """Helper class for handling Peptide Record files.""" - - required_columns = ["spec_id", "peptide", "modifications"] - optional_columns = [ - "charge", - "observed_retention_time", - "predicted_retention_time", - "label", - "score", - ] - - def __init__( - self, - filename: str | Path, - required_columns: list[str] = None, - optional_columns: list[str] = None, - ) -> None: - """ - Helper class for handling Peptide Record files. - - Upon initialization, the separator inferred and presence of required columns - is checked. +_REQUIRED_COLUMNS = ["spec_id", "peptide", "modifications"] +_OPTIONAL_COLUMNS = [ + "charge", + "observed_retention_time", + "predicted_retention_time", + "label", + "score", +] + + +def _analyze_peprec_file(filename: str | Path) -> tuple[str, list[str]]: + """Analyze Peptide Record file to determine separator and validate columns.""" + separator = "" + header: list[str] = [] + + with open(filename) as f: + line = f.readline().strip() + for sep in ["\t", ",", ";", " "]: + cols = line.split(sep) + if all(rc in cols for rc in _REQUIRED_COLUMNS): + separator = sep + header = cols + break + else: + raise InvalidPeprecError( + "Could not infer separator. Please validate the Peptide Record " + "header and/or the required columns." + ) - Parameters - ---------- - filename: str, pathlib.Path - Path to PSM file. - required_columns: list[str] - Override default columns. - optional_columns: list[str] - Override default columns. 
+ # Validate required columns + for rc in _REQUIRED_COLUMNS: + if rc not in header: + raise InvalidPeprecError(f"Required column missing: `{rc}`") - Attributes - ---------- - separator: str - Separator (delimiter) used in Peptide Record file. - header: list[str] - Column names used in Peptide Record file. + return separator, header - Raises - ------ - InvalidPeprecError - If Peptide Record separator cannot be inferred from header. - """ - self.filename = filename - self.separator = None - self.header = None +class _PeprecEntry(BaseModel): + """Typed entry for Peptide Record data.""" - if required_columns: - self.required_columns = required_columns - else: - self.required_columns = self.required_columns.copy() # Copy from class - if optional_columns: - self.optional_columns = optional_columns - else: - self.optional_columns = self.optional_columns.copy() # Copy from class - - self._infer_separator() - self._validate_required_columns() - - def __repr__(self) -> str: - return f"_PeptideRecord('{self.filename}')" - - def _infer_separator(self) -> None: - """Infer separator used in Peptide Record file.""" - with open(self.filename, "rt") as f: - line = f.readline().strip() - for sep in ["\t", ",", ";", " "]: - cols = line.split(sep) - if all(rc in cols for rc in self.required_columns): - self.separator = sep - break - else: - raise InvalidPeprecError( - "Could not infer separator. Please validate the Peptide Record " - "header and/or the `required_columns` setting." 
- ) + spec_id: str + peptide: str + modifications: str + charge: str | None = None + observed_retention_time: str | None = None + predicted_retention_time: str | None = None + label: str | None = None + score: str | None = None - def _validate_required_columns(self) -> None: - """Raise InvalidPeprecError if not all required columns are present.""" - with open(self.filename, "rt") as f: - reader = csv.reader(f, delimiter=self.separator) - self.header = next(reader) - for rc in self.required_columns: - if rc not in self.header: - raise InvalidPeprecError(f"Required column missing: `{rc}`") + model_config = ConfigDict(extra="ignore") class PeptideRecordReader(ReaderBase): + """Reader for Peptide Record PSM files.""" + def __init__( self, filename: str | Path, @@ -173,12 +138,15 @@ def __init__( Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. 
Examples -------- - PeptideRecordReader supports iteration: >>> from psm_utils.io.peptide_record import PeptideRecordReader @@ -188,36 +156,30 @@ def __init__( AC[Carbamidomethyl]DEFGR [Acetyl]-AC[Carbamidomethyl]DEFGHIK - Or a full file can be read at once into a :py:class:`~psm_utils.psm_list.PSMList` - object: + Or a full file can be read at once into a :py:class:`~psm_utils.psm_list.PSMList` object: >>> peprec_reader = PeptideRecordReader("peprec.txt") >>> psm_list = peprec_reader.read_file() """ - super().__init__(filename, *args, **kwargs) - self._peprec = _PeptideRecord(self.filename) + self._separator, self._header = _analyze_peprec_file(self.filename) - # Define named tuple for single Peptide Record entries, based on - # configured columns - columns = self._peprec.required_columns + self._peprec.optional_columns - self.PeprecEntry = namedtuple("PeprecEntry", columns, defaults=[None for _ in columns]) - - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "rt") as open_file: - reader = csv.DictReader(open_file, delimiter=self._peprec.separator) + with open(self.filename) as open_file: + reader = csv.DictReader(open_file, delimiter=self._separator) for row in reader: - entry = self.PeprecEntry(**row) + entry = _PeprecEntry(**row) psm = self._entry_to_psm(entry, filename=self.filename) yield psm @staticmethod - def _entry_to_psm(entry: NamedTuple, filename: Optional[str] = None) -> PSM: - """Parse single Peptide Record entry to `PSM`.""" + def _entry_to_psm(entry: _PeprecEntry, filename: str | Path) -> PSM: + """Parse single Peptide Record entry to PSM.""" # Parse sequence and modifications - proforma = peprec_to_proforma(entry.peptide, entry.modifications, entry.charge) + charge = int(entry.charge) if entry.charge else None + proforma = peprec_to_proforma(entry.peptide, entry.modifications, charge) # Parse decoy label if entry.label: @@ -243,35 +205,43 @@ 
def _entry_to_psm(entry: NamedTuple, filename: Optional[str] = None) -> PSM: class PeptideRecordWriter(WriterBase): - def __init__(self, filename, *args, **kwargs): + """Writer for Peptide Record PSM files.""" + + def __init__(self, filename: str | Path, *args, **kwargs) -> None: """ Writer for Peptide Record PSM files. Parameters ---------- - filename: str, Path - Path to PSM file + filename + Path to PSM file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. """ super().__init__(filename, *args, **kwargs) - self._open_file = None - self._writer = None + self._open_file: TextIO | None = None + self._writer: csv.DictWriter | None = None def __enter__(self) -> PeptideRecordWriter: + """Open file for writing and prepare CSV writer.""" + # If file exists, analyze it to determine separator and header if Path(self.filename).is_file(): - peprec = _PeptideRecord(self.filename) - self._open_file = open(self.filename, "at", newline="") + separator, header = _analyze_peprec_file(self.filename) + self._open_file = open(self.filename, "a", newline="") self._writer = csv.DictWriter( self._open_file, - fieldnames=peprec.header, + fieldnames=header, extrasaction="ignore", - delimiter=peprec.separator, + delimiter=separator, ) else: - self._open_file = open(self.filename, "wt", newline="") + self._open_file = open(self.filename, "w", newline="") self._writer = csv.DictWriter( self._open_file, - fieldnames=_PeptideRecord.required_columns + _PeptideRecord.optional_columns, + fieldnames=_REQUIRED_COLUMNS + _OPTIONAL_COLUMNS, extrasaction="ignore", delimiter=" ", ) @@ -279,12 +249,15 @@ def __enter__(self) -> PeptideRecordWriter: return self def __exit__(self, *args, **kwargs) -> None: - self._open_file.close() + """Close file when exiting context.""" + if self._open_file is not None: + self._open_file.close() self._open_file = None self._writer = None @staticmethod - def _psm_to_entry(psm: PSM) -> 
dict: + def _psm_to_entry(psm: PSM) -> dict[str, Any]: + """Convert PSM to Peptide Record entry dictionary.""" sequence, modifications, charge = proforma_to_peprec(psm.peptidoform) return { "spec_id": psm.spectrum_id, @@ -296,77 +269,77 @@ def _psm_to_entry(psm: PSM) -> dict: "score": psm.score, } - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """ Write a single PSM to new or existing Peptide Record PSM file. Parameters ---------- - psm: PSM + psm PSM object to write. Examples -------- - To write single PSMs to a file, :py:class:`PeptideRecordWriter` must be opened - as a context manager. Then, within the context, :py:func:`write_psm` can be - called: + To write single PSMs to a file, :py:class:`PeptideRecordWriter` must be opened as + a context manager. Then, within the context, :py:func:`write_psm` can be called: >>> with PeptideRecordWriter("peprec.txt") as writer: >>> writer.write_psm(psm) """ + if self._writer is None: + raise PSMUtilsIOException( + f"`write_psm` method can only be called if `{self.__class__.__qualname__}` " + "is opened in context (i.e., using the `with` statement)." + ) entry = self._psm_to_entry(psm) try: self._writer.writerow(entry) except AttributeError as e: raise PSMUtilsIOException( - f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" + f"`write_psm` method can only be called if `{self.__class__.__qualname__}` " "is opened in context (i.e., using the `with` statement)." ) from e - # TODO: Support appending to existing file? - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """ Write an entire PSMList to a new Peptide Record PSM file. Parameters ---------- - psm_list: PSMList + psm_list PSMList object to write to file. 
Examples -------- - >>> writer = PeptideRecordWriter("peprec.txt") >>> writer.write_file(psm_list) """ - with open(self.filename, "wt", newline="") as f: - fieldnames = _PeptideRecord.required_columns + _PeptideRecord.optional_columns + with open(self.filename, "w", newline="") as f: + fieldnames = _REQUIRED_COLUMNS + _OPTIONAL_COLUMNS writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter=" ") writer.writeheader() for psm in psm_list: writer.writerow(self._psm_to_entry(psm)) -def peprec_to_proforma( - peptide: str, modifications: str, charge: Optional[int] = None -) -> Peptidoform: +def peprec_to_proforma(peptide: str, modifications: str, charge: int | None = None) -> Peptidoform: """ Convert Peptide Record notation to :py:class:`~psm_utils.peptidoform.Peptidoform`. Parameters ---------- - peptide: str + peptide Stripped peptide sequence. - modifications: str + modifications Modifications in Peptide Record notation (e.g., ``4|Oxidation``) - charge: int, optional + charge Precursor charge state Returns ------- - peptidoform: psm_utils.peptidoform.Peptidoform + peptidoform Peptidoform Raises @@ -376,26 +349,26 @@ def peprec_to_proforma( """ # List of peptide sequence with added terminal positions - peptide = [""] + list(peptide) + [""] + peptide_list = [""] + list(peptide) + [""] # Add modification labels for position, label in zip(modifications.split("|")[::2], modifications.split("|")[1::2]): try: - peptide[int(position)] += f"[{label}]" + peptide_list[int(position)] += f"[{label}]" except ValueError as e: raise InvalidPeprecModificationError( f"Could not parse PEPREC modification `{modifications}`." ) from e except IndexError as e: raise InvalidPeprecModificationError( - f"PEPREC modification has invalid position {position} in " - f"peptide `{''.join(peptide)}`." + f"PEPREC modification has invalid position {position} in peptide " + f"`{''.join(peptide_list)}`." 
) from e # Add dashes between residues and termini, and join sequence - peptide[0] = peptide[0] + "-" if peptide[0] else "" - peptide[-1] = "-" + peptide[-1] if peptide[-1] else "" - proforma_seq = "".join(peptide) + peptide_list[0] = peptide_list[0] + "-" if peptide_list[0] else "" + peptide_list[-1] = "-" + peptide_list[-1] if peptide_list[-1] else "" + proforma_seq = "".join(peptide_list) # Add charge state if charge: @@ -404,21 +377,22 @@ def peprec_to_proforma( return Peptidoform(proforma_seq) -def proforma_to_peprec(peptidoform: Peptidoform) -> tuple(str, str, Optional[int]): +def proforma_to_peprec(peptidoform: Peptidoform) -> tuple[str, str, int | None]: """ Convert :py:class:`~psm_utils.peptidoform.Peptidoform` to Peptide Record notation. Parameters ---------- - peptidoform: psm_utils.peptidoform.Peptidoform + peptidoform + Input peptidoform object. Returns ------- - peptide: str + peptide Stripped peptide sequence - modifications: str + modifications Modifications in Peptide Record notation - charge: int, optional + charge Precursor charge state, if available, else :py:const:`None` Notes @@ -430,8 +404,8 @@ def proforma_to_peprec(peptidoform: Peptidoform) -> tuple(str, str, Optional[int """ - def _mod_to_ms2pip(mod_list: list, location: int): - """Proforma modification site (list) to MS²PIP modification.""" + def _mod_to_ms2pip(mod_list: list, location: int) -> str: + """Convert Proforma modification site to MS²PIP modification.""" if len(mod_list) > 1: raise InvalidPeprecModificationError( "Multiple modifications per site not supported in Peptide Record format." 
@@ -463,23 +437,20 @@ def from_dataframe(peprec_df: pd.DataFrame) -> PSMList: Parameters ---------- - peprec_df: pandas.DataFrame + peprec_df Peptide Record DataFrame Returns ------- - psm_list: PSMList + psm_list PSMList object """ - PeprecEntry = namedtuple( - "PeprecEntry", peprec_df.columns, defaults=[None for _ in peprec_df.columns] - ) psm_list = [] for _, row in peprec_df.iterrows(): - entry = PeprecEntry(**row) - psm_list.append(PeptideRecordReader._entry_to_psm(entry)) - return PSMList(psm_list) + entry = _PeprecEntry(**row.to_dict()) + psm_list.append(PeptideRecordReader._entry_to_psm(entry, filename="")) + return PSMList(psm_list=psm_list) def to_dataframe(psm_list: PSMList) -> pd.DataFrame: @@ -488,15 +459,16 @@ def to_dataframe(psm_list: PSMList) -> pd.DataFrame: Parameters ---------- - psm_list: PSMList + psm_list + Input PSMList object. Returns ------- pd.DataFrame + Peptide Record DataFrame. Examples -------- - >>> psm_list = PeptideRecordReader("peprec.csv").read_file() >>> psm_utils.io.peptide_record.to_dataframe(psm_list) spec_id peptide modifications charge label ... diff --git a/psm_utils/io/pepxml.py b/psm_utils/io/pepxml.py index 42e5b99..56cdbab 100644 --- a/psm_utils/io/pepxml.py +++ b/psm_utils/io/pepxml.py @@ -4,10 +4,11 @@ import logging from collections import defaultdict +from collections.abc import Iterator from pathlib import Path -from typing import List, Optional, Union +from typing import Any -from pyteomics import mass, pepxml, proforma +from pyteomics import mass, pepxml, proforma # type: ignore[import] from psm_utils.io._base_classes import ReaderBase from psm_utils.peptidoform import Peptidoform @@ -28,23 +29,31 @@ class PepXMLReader(ReaderBase): - def __init__(self, filename: Union[str, Path], *args, score_key: str = None, **kwargs) -> None: + """Reader for pepXML PSM files.""" + + def __init__( + self, filename: str | Path, *args: Any, score_key: str | None = None, **kwargs: Any + ) -> None: """ Reader for pepXML PSM files. 
Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. - score_key: str, optional + *args + Additional positional arguments passed to parent class. + score_key Name of the score metric to use as PSM score. If not provided, the score metric is inferred from a list of known search engine scores. + **kwargs + Additional keyword arguments passed to parent class. """ super().__init__(filename, *args, **kwargs) self.score_key = score_key or self._infer_score_name() - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with pepxml.read(str(self.filename)) as reader: for spectrum_query in reader: @@ -53,33 +62,39 @@ def __iter__(self): for search_hit in spectrum_query["search_hit"]: yield self._parse_psm(spectrum_query, search_hit) - def _infer_score_name(self) -> str: + def _infer_score_name(self) -> str | None: """Infer the score from the list of known PSM scores.""" # Get scores from first PSM with pepxml.read(str(self.filename)) as reader: for spectrum_query in reader: score_keys = spectrum_query["search_hit"][0]["search_score"].keys() break + else: + score_keys = [] # Infer score name if not score_keys: logger.warning("No pepXML scores found.") return None - else: - for score in STANDARD_SEARCHENGINE_SCORES: # Check for known scores - if score in score_keys: - logger.debug(f"Using known pepXML score `{score}`.") - return score - else: - logger.warning(f"No known pepXML scores found. Defaulting to `{score_keys[0]}`.") - return score_keys[0] # Default to the first one if nothing found + + for score in STANDARD_SEARCHENGINE_SCORES: # Check for known scores + if score in score_keys: + logger.debug(f"Using known pepXML score `{score}`.") + return score + + # Default to the first one if nothing found + logger.warning(f"No known pepXML scores found. 
Defaulting to `{score_keys[0]}`.") + return score_keys[0] @staticmethod - def _parse_peptidoform(peptide: str, modifications: List[dict], charge: Optional[int] = None): - """Parse pepXML peptide to :py:class:`~psm_utils.peptidoform.Peptidoform`.""" - modifications_dict = defaultdict(list) - n_term = [] - c_term = [] + def _parse_peptidoform( + peptide: str, modifications: list[dict[str, Any]], charge: int | None = None + ) -> Peptidoform: + """Parse pepXML peptide to Peptidoform.""" + modifications_dict: dict[int, list[Any]] = defaultdict(list) + n_term: list[Any] = [] + c_term: list[Any] = [] + for mod in modifications: # Round mass modification to 6 decimal places, precision from UniMod if mod["position"] == 0: @@ -110,8 +125,22 @@ def _parse_peptidoform(peptide: str, modifications: List[dict], charge: Optional } return Peptidoform(proforma.ProForma(sequence, properties)) - def _parse_psm(self, spectrum_query: dict, search_hit: dict) -> PSM: - """Parse pepXML PSM to :py:class:`~psm_utils.psm.PSM`.""" + def _parse_psm(self, spectrum_query: dict[str, Any], search_hit: dict[str, Any]) -> PSM: + """Parse pepXML PSM to PSM.""" + # Build metadata dictionary properly + metadata = { + "num_matched_ions": str(search_hit["num_matched_ions"]), + "tot_num_ions": str(search_hit["tot_num_ions"]), + "num_missed_cleavages": str(search_hit["num_missed_cleavages"]), + } + # Add search scores to metadata + metadata.update( + { + f"search_score_{key.lower()}": str(search_hit["search_score"][key]) + for key in search_hit["search_score"] + } + ) + return PSM( peptidoform=self._parse_peptidoform( search_hit["peptide"], @@ -129,12 +158,8 @@ def _parse_psm(self, spectrum_query: dict, search_hit: dict) -> PSM: precursor_mz=mass_to_mz( spectrum_query["precursor_neutral_mass"], spectrum_query["assumed_charge"] ), - retention_time=spectrum_query["retention_time_sec"] - if "retention_time_sec" in spectrum_query - else None, - ion_mobility=spectrum_query["ion_mobility"] - if "ion_mobility" in 
spectrum_query - else None, + retention_time=spectrum_query.get("retention_time_sec"), + ion_mobility=spectrum_query.get("ion_mobility"), protein_list=[p["protein"] for p in search_hit["proteins"]], rank=search_hit["hit_rank"], source=None, @@ -143,15 +168,6 @@ def _parse_psm(self, spectrum_query: dict, search_hit: dict) -> PSM: "start_scan": str(spectrum_query["start_scan"]), "end_scan": str(spectrum_query["end_scan"]), }, - metadata={ - "num_matched_ions": str(search_hit["num_matched_ions"]), - "tot_num_ions": str(search_hit["tot_num_ions"]), - "num_missed_cleavages": str(search_hit["num_missed_cleavages"]), - }.update( - { - f"search_score_{key.lower()}": str(search_hit["search_score"][key]) - for key in search_hit["search_score"] - } - ), - rescoring_features=dict(), + metadata=metadata, + rescoring_features={}, ) diff --git a/psm_utils/io/percolator.py b/psm_utils/io/percolator.py index 045d09c..07919ad 100644 --- a/psm_utils/io/percolator.py +++ b/psm_utils/io/percolator.py @@ -6,7 +6,6 @@ Notes ----- - * While :py:class:`PercolatorTabReader` supports reading the peptide notation with preceding and following amino acids (e.g. ``R.ACDEK.F``), these amino acids are not stored and are not written by :py:class:`PercolatorTabWriter`. 
@@ -18,8 +17,9 @@ import csv import logging import re +from collections.abc import Iterator from pathlib import Path -from typing import Iterable, List, Optional, Tuple, Union +from typing import Any from psm_utils.io._base_classes import ReaderBase, WriterBase from psm_utils.io._utils import set_csv_field_size_limit @@ -33,14 +33,16 @@ class PercolatorTabReader(ReaderBase): + """Reader for Percolator Tab PIN/POUT format.""" + def __init__( self, filename: str | Path, - score_column=None, - retention_time_column=None, - mz_column=None, - *args, - **kwargs, + *args: Any, + score_column: str | None = None, + retention_time_column: str | None = None, + mz_column: str | None = None, + **kwargs: Any, ) -> None: """ Reader for Percolator Tab PIN/POUT PSM file. @@ -48,21 +50,23 @@ def __init__( As the score, retention time, and precursor m/z are often embedded as feature columns, but not with a fixed column name, their respective column names need to be provided as parameters to the class. If not provided, these properties will - not be added to the resulting :py:class:`~psm_utils.psm.PSM`. - Nevertheless, they will still be added to its - :py:attr:`~psm_utils.psm.PSM.rescoring_features` property - dictionary, along with the other features. + not be added to the resulting PSM. Nevertheless, they will still be added to its + rescoring_features property dictionary, along with the other features. Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. - score_column: str, optional + *args + Additional positional arguments passed to parent class. + score_column Name of the column that holds the primary PSM score. - retention_time_column: str, optional + retention_time_column Name of the column that holds the retention time. - mz_column: str, optional + mz_column Name of the column that holds the precursor m/z. + **kwargs + Additional keyword arguments passed to parent class. 
""" super().__init__(filename, *args, **kwargs) @@ -99,31 +103,28 @@ def __init__( for col in [self.score_column, self.rt_column, self.mz_column]: if col and col.lower() not in self.fieldnames: raise PercolatorIOException( - f"Column `{col}` not found in header of Percolator Tab file " - f"`{self.filename}`." + f"Column `{col}` not found in header of Percolator Tab file `{self.filename}`." ) - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with _PercolatorTabIO( self.filename, "rt", protein_separator=self._protein_separator ) as open_file: reader = csv.DictReader(open_file, delimiter="\t") for entry in reader: - if entry[self.id_column] == "DefaultDirection": - pass - else: - psm = self._parse_entry(entry) - yield psm + if entry[self.id_column] != "DefaultDirection": + yield self._parse_entry(entry) @staticmethod - def _read_header(filename): - with open(filename, "rt") as f: + def _read_header(filename: str | Path) -> list[str]: + """Read header line and return fieldnames.""" + with open(filename) as f: fieldnames = f.readline().strip().lower().split("\t") return fieldnames @staticmethod - def _infer_charge_columns(fieldnames): + def _infer_charge_columns(fieldnames: list[str]) -> tuple[str | None, dict[int, str]]: """Infer columns that hold the precursor charge from the header fieldnames.""" # Infer single charge column charge_column = None @@ -142,7 +143,7 @@ def _infer_charge_columns(fieldnames): return charge_column, charge_onehot_columns @staticmethod - def _parse_peptidoform(percolator_peptide, charge): + def _parse_peptidoform(percolator_peptide: str, charge: int | None) -> Peptidoform: """Parse Percolator TSV peptide notation to Peptidoform.""" # Remove leading and trailing amino acids match = re.match(r"^(?:[A-Z-])?\.(.+)\.(?:[A-Z-])?$", percolator_peptide) @@ -151,7 +152,7 @@ def _parse_peptidoform(percolator_peptide, charge): peptidoform += f"/{charge}" return 
Peptidoform(peptidoform) - def _parse_charge(self, entry) -> int | None: + def _parse_charge(self, entry: dict[str, str]) -> int | None: """Parse charge state from single or one-hot encoded charge state.""" if self.charge_column: return int(entry["charge"]) @@ -159,32 +160,32 @@ def _parse_charge(self, entry) -> int | None: for charge_state, column_name in self.charge_onehot_columns.items(): if entry[column_name] == "1": return charge_state - else: - return None + return None - def _parse_entry(self, entry): + def _parse_entry(self, entry: dict[str, str]) -> PSM: """Parse Percolator TSV entry to PSM.""" - label = entry["label"] if "label" in entry else None + label = entry.get("label") is_decoy = True if label == "-1" else False if label == "1" else None rescoring_features = { k: str(v) for k, v in entry.items() if k not in self.non_feature_columns } charge = self._parse_charge(entry) peptidoform = self._parse_peptidoform(entry["peptide"], charge) - protein_list = ( - entry["proteins"].split(self._protein_separator) - if "proteins" in entry - else entry["proteinids"].split(self._protein_separator) - if "proteinids" in entry - else None - ) - psm = PSM( + + # Get protein list + protein_list = None + if "proteins" in entry: + protein_list = entry["proteins"].split(self._protein_separator) + elif "proteinids" in entry: + protein_list = entry["proteinids"].split(self._protein_separator) + + return PSM( peptidoform=peptidoform, spectrum_id=entry[self.id_column], is_decoy=is_decoy, score=float(entry[self.score_column.lower()]) if self.score_column else None, - qvalue=entry["q-value"] if "q-value" in entry else None, - pep=entry["posterior_error_prob"] if "posterior_error_prob" in entry else None, + qvalue=entry.get("q-value"), + pep=entry.get("posterior_error_prob"), precursor_mz=float(entry[self.mz_column.lower()]) if self.mz_column else None, retention_time=float(entry[self.rt_column.lower()]) if self.rt_column else None, protein_list=protein_list, @@ -192,43 +193,47 
@@ def _parse_entry(self, entry): provenance_data={"filename": str(self.filename)}, rescoring_features=rescoring_features, ) - return psm class PercolatorTabWriter(WriterBase): + """Writer for Percolator Tab PIN/POUT format.""" + def __init__( self, filename: str | Path, - style: Optional[str] = None, - feature_names: Optional[list[str]] = None, + *args: Any, + style: str | None = None, + feature_names: list[str] | None = None, add_basic_features: bool = False, - *args, - **kwargs, + **kwargs: Any, ) -> None: """ Writer for Percolator TSV "PIN" and "POUT" PSM files. Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. - style: str, optional + *args + Additional positional arguments passed to parent class. + style Percolator Tab style. One of {``pin``, ``pout``}. If ``pin``, the columns ``SpecId``, ``Label``, ``ScanNr``, ``ChargeN``, ``PSMScore``, ``Peptide``, and ``Proteins`` are written alongside the requested feature names (see ``feature_names``). If ``pout``, the columns ``PSMId``, ``Label``, ``score``, ``q-value``, ``posterior_error_prob``, ``peptide``, and ``proteinIds`` are written. By default, the style is inferred from the file name extension. - feature_names: list[str], optional + feature_names List of feature names to extract from PSMs and write to file. List values - should correspond to keys in the - :py:class:`~psm_utils.psm.PSM.rescoring_features` property. - If :py:const:`None`, no rescoring features will be written to the file. If appending to + should correspond to keys in the rescoring_features property. + If None, no rescoring features will be written to the file. If appending to an existing file, the existing header will be used to determine the feature names. Only has effect with ``pin`` style. - add_basic_features: bool, optional - If :py:const:`True`, add ``PSMScore`` and ``ChargeN`` features to the file. Only has - effect with ``pin`` style. Default is :py:const:`False`. 
+ add_basic_features + If True, add ``PSMScore`` and ``ChargeN`` features to the file. Only has + effect with ``pin`` style. + **kwargs + Additional keyword arguments passed to parent class. """ super().__init__(filename, *args, **kwargs) @@ -270,8 +275,8 @@ def __init__( else: raise ValueError("Invalid Percolator Tab style. Should be one of {`pin`, `pout`}.") - self._open_file = None - self._writer = None + self._open_file: _PercolatorTabIO | None = None + self._writer: csv.DictWriter[str] | None = None self._protein_separator = "|||" self._current_scannr = 0 @@ -283,7 +288,7 @@ def __enter__(self) -> PercolatorTabWriter: self.filename, mode, newline="", protein_separator=self._protein_separator ) if file_existed: - fieldnames, self._current_scannr = self._parse_existing_file(self.filename) + fieldnames, self._current_scannr = self._parse_existing_file(self.filename, self.style) else: fieldnames = self._columns self._current_scannr = -1 @@ -298,27 +303,27 @@ def __enter__(self) -> PercolatorTabWriter: return self def __exit__(self, *args, **kwargs) -> None: - self._open_file.close() + """Close file and writer.""" + if self._open_file is not None: + self._open_file.close() self._open_file = None self._writer = None - self._current_scannr = None + self._current_scannr = 0 - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """Write a single PSM to the PSM file.""" + if self._writer is None: + raise PSMUtilsIOException( + f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" + " is opened in context (i.e., using the `with` statement)." + ) entry = self._psm_to_entry(psm) self._current_scannr += 1 entry["ScanNr"] = self._current_scannr - try: - self._writer.writerow(entry) - except AttributeError as e: - raise PSMUtilsIOException( - f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" - "is opened in context (i.e., using the `with` statement)." 
- ) from e - else: - self._current_scannr = entry["ScanNr"] + self._writer.writerow(entry) + self._current_scannr = entry["ScanNr"] - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """Write an entire PSMList to the PSM file.""" with _PercolatorTabIO( self.filename, "wt", newline="", protein_separator=self._protein_separator @@ -330,10 +335,10 @@ def write_file(self, psm_list: PSMList): for psm in psm_list: writer.writerow(self._psm_to_entry(psm)) - def _psm_to_entry(self, psm: PSM): + def _psm_to_entry(self, psm: PSM) -> dict[str, Any]: """Parse PSM to Percolator Tab entry.""" if self.style == "pin": - entry = { + entry: dict[str, Any] = { "SpecId": psm.spectrum_id, "Label": None if psm.is_decoy is None else -1 if psm.is_decoy else 1, "Peptide": "." + re.sub(r"/\d+$", "", psm.peptidoform.proforma) + ".", @@ -343,10 +348,8 @@ def _psm_to_entry(self, psm: PSM): } if self.add_basic_features: entry.update({"ChargeN": psm.peptidoform.precursor_charge, "PSMScore": psm.score}) - try: + if psm.rescoring_features is not None: entry.update(psm.rescoring_features) - except TypeError: - pass elif self.style == "pout": entry = { "PSMId": psm.spectrum_id, @@ -359,15 +362,15 @@ def _psm_to_entry(self, psm: PSM): if psm.protein_list else None, } + else: + raise ValueError("Invalid Percolator Tab style. 
Should be one of {`pin`, `pout`}.") return entry @staticmethod - def _parse_existing_file( - filename: Union[str, Path], style: str - ) -> Tuple[List[str], Optional[int]]: + def _parse_existing_file(filename: str | Path, style: str) -> tuple[list[str], int]: """Parse existing Percolator Tab file to determine fieldnames and last ScanNr.""" # Get fieldnames - with open(filename, "rt") as open_file: + with open(filename) as open_file: for line in open_file: fieldnames = line.strip().split("\t") break @@ -382,7 +385,7 @@ def _parse_existing_file( # Get last ScanNr last_scannr = None - with open(filename, "rt") as open_file: + with open(filename) as open_file: # Read last line open_file.seek(0) last_line = None @@ -407,25 +410,25 @@ def _parse_existing_file( class _PercolatorTabIO: - def __init__(self, *args, protein_separator="|||", **kwargs) -> None: + def __init__(self, *args: Any, protein_separator: str = "|||", **kwargs: Any) -> None: """File reader and writer for Percolator Tab files with fixed Proteins tab.""" self._open_file = open(*args, **kwargs) self.protein_separator = protein_separator - def __enter__(self, *args, **kwargs) -> _PercolatorTabIO: + def __enter__(self) -> _PercolatorTabIO: return self - def __exit__(self, *args, **kwargs): + def __exit__(self, *args: Any) -> None: self.close() - def __iter__(self): - """Yield lines in file with Proteins tab replaced by separator.""" - number_of_columns = None + def __iter__(self) -> Iterator[str]: + """Iterate over lines in file with Proteins tab replaced by separator.""" + number_of_columns = 0 for i, line in enumerate(self._open_file): if i == 0: number_of_columns = len(line.split("\t")) yield line.lower() - elif i == 1 & line.startswith("DefaultDirection"): + elif i == 1 and line.startswith("DefaultDirection"): yield line else: r = line.strip().split("\t") @@ -435,16 +438,16 @@ def __iter__(self): line = "\t".join(row_columns) + "\n" yield line - def close(self): + def close(self) -> None: 
self._open_file.close() - def write(self, __s: str): + def write(self, __s: str) -> None: """Write line to file with Proteins separator replaced by tab.""" __s = __s.replace(self.protein_separator, "\t") self._open_file.write(__s) -def _fieldnames_are_valid(fieldnames: List[str], style: str) -> bool: +def _fieldnames_are_valid(fieldnames: list[str], style: str) -> bool: """Check if fieldnames are valid for Percolator Tab style.""" if style == "pin": required_columns = ["SpecId", "Label", "ScanNr"] @@ -459,15 +462,18 @@ def join_pout_files( target_filename: str | Path, decoy_filename: str | Path, output_filename: str | Path, -): +) -> None: """ Join target and decoy Percolator Out (POUT) files into single PercolatorTab file. Parameters ---------- - target_filename: str, Path - decoy_filename: str, Path - output_filename: str, Path + target_filename + Path to target POUT file. + decoy_filename + Path to decoy POUT file. + output_filename + Path to output combined POUT file. """ target_reader = PercolatorTabReader(target_filename, score_column="score") diff --git a/psm_utils/io/proteome_discoverer.py b/psm_utils/io/proteome_discoverer.py index edb7a5e..f3de36d 100644 --- a/psm_utils/io/proteome_discoverer.py +++ b/psm_utils/io/proteome_discoverer.py @@ -1,14 +1,39 @@ -"""Reader for Proteome Discoverer MSF PSM files.""" +""" +Reader for Proteome Discoverer MSF PSM files. + +This module provides functionality to read PSM data from Proteome Discoverer MSF SQLite database +files. + +The reader supports both target and decoy peptides, handles various modification types (amino acid +and terminal modifications), and extracts complete scoring information from the MSF database +structure. + +Examples +-------- +>>> from psm_utils.io.proteome_discoverer import MSFReader +>>> reader = MSFReader("results.msf") +>>> psm_list = reader.read_file() +>>> for psm in reader: +... 
print(f"{psm.peptidoform} - Score: {psm.score}") + +Notes +----- +MSF file versions 79, 53, and 8 are currently supported. + +""" + +from __future__ import annotations import logging import re from collections import defaultdict +from collections.abc import Iterator from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Any -import pyteomics.proforma as proforma -from sqlalchemy import create_engine -from sqlalchemy.orm import sessionmaker +import pyteomics.proforma as proforma # type: ignore[import-untyped] +from sqlalchemy import create_engine, func, select +from sqlalchemy.orm import Session import psm_utils.io._pd_msf_tables as msf from psm_utils import PSM, Peptidoform @@ -16,42 +41,75 @@ logger = logging.getLogger(__name__) -COMPATIBLE_VERSIONS = [79] +COMPATIBLE_VERSIONS = [79, 53, 8] class MSFReader(ReaderBase): - """Reader for Proteome Discoverer MSF files.""" + """ + Reader for Proteome Discoverer MSF files. + + This reader processes SQLite-based MSF database files from Proteome Discoverer, extracting + peptide-spectrum matches with complete modification information, scoring data, and metadata. + Supports both target and decoy peptides. + + Examples + -------- + >>> reader = MSFReader("experiment.msf") + >>> psm_list = reader.read_file() + >>> len(reader) # Get total number of PSMs + 1234 + >>> for psm in reader: # Iterate over all PSMs + ... if psm.qvalue and psm.qvalue < 0.01: + ... print(f"High-confidence PSM: {psm.peptidoform}") + + """ def __init__( self, - filename: Union[str, Path], + filename: str | Path, *args, **kwargs, ) -> None: """ - Reader for Proteome Discoverer MSF file. + Initialize MSF reader with database connection and version validation. Parameters ---------- - filename: str, pathlib.Path - Path to MSF file. + filename + Path to Proteome Discoverer MSF file. + *args + Additional positional arguments passed to parent class. + **kwargs + Additional keyword arguments passed to parent class. 
""" super().__init__(filename, *args, **kwargs) self._engine = create_engine(f"sqlite:///{self.filename.as_posix()}") - self._session = sessionmaker(bind=self._engine)() + self._session = Session(self._engine) self._check_version() - def __len__(self): - """Return number of PSMs in file.""" - return sum( - self._session.query(peptide).count() for peptide in [msf.Peptide, msf.PeptideDecoy] + def __len__(self) -> int: + """Return total number of PSMs in file.""" + peptide_count = ( + self._session.execute(select(func.count()).select_from(msf.Peptide)).scalar() or 0 ) + decoy_count = ( + self._session.execute(select(func.count()).select_from(msf.PeptideDecoy)).scalar() or 0 + ) + return peptide_count + decoy_count - def __iter__(self): - """Iterate over file and return PSMs one-by-one.""" + def __iter__(self) -> Iterator[PSM]: + """ + Iterate over file and return PSMs one-by-one. + + Yields + ------ + PSM + Individual PSM objects with complete modification and scoring information. + + """ for is_decoy in [False, True]: modifications = self._get_modifications(is_decoy) terminal_modifications = self._get_terminal_modifications(is_decoy) @@ -60,7 +118,8 @@ def __iter__(self): secondary_scores = self._get_secondary_scores(is_decoy) for entry in self._iter_peptides(is_decoy): - peptide_id = entry.PeptideDecoy.PeptideID if is_decoy else entry.Peptide.PeptideID + peptide = entry[0] # First element is Peptide or PeptideDecoy + peptide_id = peptide.PeptideID yield self._parse_entry( entry, modifications[peptide_id], @@ -71,36 +130,77 @@ def __iter__(self): is_decoy, ) - def _check_version(self): - """Check if MSF file version is compatible.""" - version = self._session.query(msf.SchemaInfo.Version).first()[0] + def __enter__(self) -> ReaderBase: + """Enter context manager for MSFReader.""" + return super().__enter__() + + def __exit__(self, *args, **kwargs) -> None: + """Exit context manager for MSFReader.""" + self._session.close() + return super().__exit__(*args, **kwargs) + 
+ def _check_version(self) -> None: + """Check MSF file version compatibility.""" + first_result = self._session.execute(select(msf.SchemaInfo.Version)).first() + if first_result is None: + logger.warning( + "MSF file does not contain version information. " + "Assuming it is compatible with this reader." + ) + return None + version = first_result[0] if version not in COMPATIBLE_VERSIONS: logger.warning( f"MSF file version {version} might not be compatible with this reader. " f"Checked versions are: {COMPATIBLE_VERSIONS}." ) - def _iter_peptides(self, is_decoy: bool): - """Iterate over peptides in MSF file.""" - Peptide = msf.PeptideDecoy if is_decoy else msf.Peptide - for entry in ( - self._session.query(Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo) - .select_from(Peptide) - .join(msf.SpectrumHeader, Peptide.SpectrumID == msf.SpectrumHeader.SpectrumID) + def _iter_peptides(self, is_decoy: bool) -> Iterator[Any]: + """ + Iterate over peptides in MSF file. + + Parameters + ---------- + is_decoy : bool + Whether to iterate over decoy peptides instead of target peptides. + + Yields + ------ + Any + SQLAlchemy Row object containing joined Peptide, SpectrumHeader, MassPeak, and + FileInfo data. The Row object has attributes like row[0] (Peptide/PeptideDecoy), + row[1] (SpectrumHeader), row[2] (MassPeak), and row[3] (FileInfo). + + Notes + ----- + This method performs a complex join across multiple MSF database tables to gather + all necessary information for PSM construction. The returned rows contain all + spectral and identification metadata needed for downstream processing. 
+ + """ + # Select appropriate peptide table based on decoy flag + peptide_table = msf.PeptideDecoy if is_decoy else msf.Peptide + + # Build and execute query - same structure for both target and decoy + stmt = ( + select(peptide_table, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo) + .select_from(peptide_table) + .join(msf.SpectrumHeader, peptide_table.SpectrumID == msf.SpectrumHeader.SpectrumID) .join(msf.MassPeak, msf.MassPeak.MassPeakID == msf.SpectrumHeader.MassPeakID) .join(msf.FileInfo, msf.FileInfo.FileID == msf.MassPeak.FileID) - ): - yield entry + ) - def _get_modifications(self, is_decoy: bool) -> Dict[int, Tuple[int, int]]: - """Get all modifications per peptide ID.""" + yield from self._session.execute(stmt) + + def _get_modifications(self, is_decoy: bool) -> dict[int, list[tuple[int, int]]]: + """Get amino acid modifications per peptide ID.""" PeptidesAminoAcidModification = ( msf.PeptidesAminoAcidModificationsDecoy if is_decoy else msf.PeptidesAminoAcidModification ) - query = ( - self._session.query( + stmt = ( + select( PeptidesAminoAcidModification.PeptideID, PeptidesAminoAcidModification.Position, msf.AminoAcidModification.UnimodAccession, @@ -112,19 +212,19 @@ def _get_modifications(self, is_decoy: bool) -> Dict[int, Tuple[int, int]]: == msf.AminoAcidModification.AminoAcidModificationID, ) ) - modifications_by_peptide = defaultdict(list) - for peptide_id, position, unimod_accession in query: + modifications_by_peptide: dict[int, list[tuple[int, int]]] = defaultdict(list) + for peptide_id, position, unimod_accession in self._session.execute(stmt): modifications_by_peptide[peptide_id].append((position, unimod_accession)) return modifications_by_peptide - def _get_terminal_modifications(self, is_decoy: bool) -> Dict[int, Tuple[int, int]]: - """Get terminal modifications for a peptide.""" + def _get_terminal_modifications(self, is_decoy: bool) -> dict[int, list[tuple[int, int]]]: + """Get terminal modifications per peptide ID.""" 
PeptidesTerminalModification = ( msf.PeptidesTerminalModification if is_decoy else msf.PeptidesTerminalModificationDecoy ) - query = ( - self._session.query( + stmt = ( + select( PeptidesTerminalModification.PeptideID, msf.AminoAcidModification.PositionType, msf.AminoAcidModification.UnimodAccession, @@ -136,32 +236,32 @@ def _get_terminal_modifications(self, is_decoy: bool) -> Dict[int, Tuple[int, in == msf.AminoAcidModification.AminoAcidModificationID, ) ) - terminal_modifications = defaultdict(list) - for peptide_id, position_type, unimod_accession in query: + terminal_modifications: dict[int, list[tuple[int, int]]] = defaultdict(list) + for peptide_id, position_type, unimod_accession in self._session.execute(stmt): terminal_modifications[peptide_id].append((position_type, unimod_accession)) return terminal_modifications - def _get_protein_entries(self, is_decoy: bool) -> Dict[int, List[str]]: - """Get protein descriptions or a peptide.""" + def _get_protein_entries(self, is_decoy: bool) -> dict[int, list[str]]: + """Get protein descriptions per peptide ID.""" PeptidesProtein = msf.PeptidesProteinDecoy if is_decoy else msf.PeptidesProtein - query = ( - self._session.query(PeptidesProtein.PeptideID, msf.ProteinAnnotation.Description) + stmt = ( + select(PeptidesProtein.PeptideID, msf.ProteinAnnotation.Description) .select_from(PeptidesProtein) .join( msf.ProteinAnnotation, PeptidesProtein.ProteinID == msf.ProteinAnnotation.ProteinID, ) ) - proteins = defaultdict(list) - for peptide_id, description in query: + proteins: dict[int, list[str]] = defaultdict(list) + for peptide_id, description in self._session.execute(stmt): proteins[peptide_id].append(re.sub(r"^>", "", description)) return proteins - def _get_main_score(self, is_decoy: bool) -> Dict[int, Tuple[float, str]]: - """Get main score and its name for a peptide.""" + def _get_main_score(self, is_decoy: bool) -> dict[int, tuple[float, str]]: + """Get main score and name per peptide ID.""" PeptideScore = 
msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore - query = ( - self._session.query( + stmt = ( + select( PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName ) .select_from(PeptideScore) @@ -171,16 +271,16 @@ def _get_main_score(self, is_decoy: bool) -> Dict[int, Tuple[float, str]]: ) .filter(msf.ProcessingNodeScore.IsMainScore == True) # noqa: E712 ) - scores = dict() - for peptide_id, score_value, score_name in query: + scores: dict[int, tuple[float, str]] = {} + for peptide_id, score_value, score_name in self._session.execute(stmt): scores[peptide_id] = (score_value, score_name) return scores - def _get_secondary_scores(self, is_decoy: bool) -> Dict[int, Dict[str, float]]: - """Get secondary scores and their names for a peptide.""" + def _get_secondary_scores(self, is_decoy: bool) -> dict[int, dict[str, float]]: + """Get secondary scores per peptide ID.""" PeptideScore = msf.PeptideScoreDecoy if is_decoy else msf.PeptideScore - query = ( - self._session.query( + stmt = ( + select( PeptideScore.PeptideID, PeptideScore.ScoreValue, msf.ProcessingNodeScore.ScoreName ) .select_from(PeptideScore) @@ -190,8 +290,8 @@ def _get_secondary_scores(self, is_decoy: bool) -> Dict[int, Dict[str, float]]: ) .filter(msf.ProcessingNodeScore.IsMainScore == False) # noqa: E712 ) - scores = defaultdict(dict) - for peptide_id, score_value, score_name in query: + scores: dict[int, dict[str, float]] = defaultdict(dict) + for peptide_id, score_value, score_name in self._session.execute(stmt): scores[peptide_id][score_name] = score_value return scores @@ -199,8 +299,8 @@ def _compile_peptidoform( self, sequence: str, charge: int, - modifications: List[Tuple[int, int]], - terminal_modifications: List[Tuple[int, int]], + modifications: list[tuple[int, int]], + terminal_modifications: list[tuple[int, int]], ) -> Peptidoform: """ Compile a peptidoform from a sequence, charge, and list of (terminal) modifications. 
@@ -238,7 +338,7 @@ def _compile_peptidoform( if position_type in [2, 4] # Position types 'Any C-term' or 'Protein C-term' ] - sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(sequence)] + parsed_sequence = [(aa, modifications_dict[i] or None) for i, aa in enumerate(sequence)] properties = { "n_term": n_term, "c_term": c_term, @@ -251,48 +351,85 @@ def _compile_peptidoform( "group_ids": [], } - return Peptidoform(proforma.ProForma(sequence, properties)) + return Peptidoform(proforma.ProForma(parsed_sequence, properties)) def _parse_entry( self, - entry: Tuple[msf.Peptide, msf.SpectrumHeader, msf.MassPeak, msf.FileInfo], - modifications: List[Tuple[int, int]], - terminal_modifications: List[Tuple[int, int]], - protein_entries: List[str], - main_score: Tuple[float, str], - secondary_scores: Dict[str, float], + entry: Any, # SQLAlchemy Row[tuple[Peptide|PeptideDecoy, SpectrumHeader, MassPeak, FileInfo]] + modifications: list[tuple[int, int]], + terminal_modifications: list[tuple[int, int]], + protein_entries: list[str], + main_score: tuple[float, str], + secondary_scores: dict[str, float], is_decoy: bool, ) -> PSM: - """Parse an entry from the MSF file.""" - peptide = entry.PeptideDecoy if is_decoy else entry.Peptide + """ + Parse an entry from the MSF file into a PSM object. + + Parameters + ---------- + entry : Any + SQLAlchemy Row object containing joined peptide, spectrum, and file information. + Accessed by index: entry[0] (Peptide/PeptideDecoy), entry[1] (SpectrumHeader), + entry[2] (MassPeak), entry[3] (FileInfo). + modifications : list[tuple[int, int]] + List of tuples containing (position, UNIMOD accession) for amino acid modifications. + terminal_modifications : list[tuple[int, int]] + List of tuples containing (position_type, UNIMOD accession) for terminal modifications. + protein_entries : list[str] + List of protein descriptions associated with this peptide. 
+ main_score : tuple[float, str] + Tuple containing (score_value, score_name) for the main search engine score. + secondary_scores : dict[str, float] + Dictionary mapping score names to values for secondary scores. + is_decoy : bool + Whether this PSM is from a decoy search. + + Returns + ------- + PSM + Complete PSM object with all available metadata and scoring information. + + Notes + ----- + This method constructs a complete PSM object by: + - Creating a peptidoform from sequence and modifications + - Extracting spectrum identification and precursor information + - Including all available scoring metrics + - Adding proteome discoverer-specific metadata + + """ + peptide = entry[0] # First element is Peptide or PeptideDecoy + spectrum_header = entry[1] # Second element is SpectrumHeader + mass_peak = entry[2] # Third element is MassPeak + file_info = entry[3] # Fourth element is FileInfo + return PSM( peptidoform=self._compile_peptidoform( peptide.Sequence, - entry.SpectrumHeader.Charge, + spectrum_header.Charge, modifications, terminal_modifications, ), - spectrum_id=entry.SpectrumHeader.LastScan, - run=Path(entry.FileInfo.FileName).stem, + spectrum_id=spectrum_header.LastScan, + run=Path(file_info.FileName).stem, is_decoy=is_decoy, score=main_score[0], qvalue=None, pep=None, - precursor_mz=entry.MassPeak.Mass, - retention_time=entry.SpectrumHeader.RetentionTime, + precursor_mz=mass_peak.Mass, + retention_time=spectrum_header.RetentionTime, ion_mobility=None, protein_list=protein_entries, rank=peptide.SearchEngineRank, source="proteome_discoverer", provenance_data={ - "scan_numbers": entry.SpectrumHeader.ScanNumbers, + "scan_numbers": spectrum_header.ScanNumbers, }, metadata={ - "ms1_intensity": str(entry.MassPeak.Intensity), - "ms1_percent_isolation_interference": str( - entry.MassPeak.PercentIsolationInterference - ), - "ms1_ion_inject_time": str(entry.MassPeak.IonInjectTime), + "ms1_intensity": str(mass_peak.Intensity), + 
"ms1_percent_isolation_interference": str(mass_peak.PercentIsolationInterference), + "ms1_ion_inject_time": str(mass_peak.IonInjectTime), "main_score_name": main_score[1], **secondary_scores, }, diff --git a/psm_utils/io/proteoscape.py b/psm_utils/io/proteoscape.py index ddc4386..1b0dcb3 100644 --- a/psm_utils/io/proteoscape.py +++ b/psm_utils/io/proteoscape.py @@ -2,18 +2,19 @@ import logging import re +from collections.abc import Iterator from pathlib import Path -from typing import Union +from typing import Any import numpy as np import pandas as pd -import pyarrow.parquet as pq +import pyarrow.parquet as pq # type: ignore[import] -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList from psm_utils.io._base_classes import ReaderBase from psm_utils.io.exceptions import PSMUtilsIOException from psm_utils.peptidoform import format_number_as_string +from psm_utils.psm import PSM +from psm_utils.psm_list import PSMList logger = logging.getLogger(__name__) @@ -25,26 +26,30 @@ class ProteoScapeReader(ReaderBase): def __init__( self, - filename: Union[str, Path], - *args, - **kwargs, + filename: str | Path, + *args: Any, + **kwargs: Any, ) -> None: """ Reader for ProteoScape Parquet files. Parameters ---------- - filename: str, pathlib.Path - Path to MSF file. + filename + Path to ProteoScape Parquet file. + *args + Additional positional arguments passed to the base class. + **kwargs + Additional keyword arguments passed to the base class. 
""" - self.filename = filename + super().__init__(filename, *args, **kwargs) - def __len__(self): + def __len__(self) -> int: """Return number of PSMs in file.""" return pq.read_metadata(self.filename).num_rows - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with pq.ParquetFile(self.filename) as reader: for batch in reader.iter_batches(): @@ -54,36 +59,36 @@ def __iter__(self): except Exception as e: raise PSMUtilsIOException(f"Error while parsing row {row}:\n{e}") from e - @classmethod - def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList: + @staticmethod + def from_dataframe(dataframe: pd.DataFrame) -> PSMList: """Create a PSMList from a ProteoScape Pandas DataFrame.""" return PSMList( - psm_list=[ - cls._get_peptide_spectrum_match(cls(""), entry) - for entry in dataframe.to_dict(orient="records") - ] + psm_list=[_parse_entry(entry) for entry in dataframe.to_dict(orient="records")] # type: ignore[arg-type] ) def _parse_peptidoform( - stripped_peptide: str, ptms: np.ndarray, ptm_locations: np.ndarray, precursor_charge: int + stripped_peptide: str, + ptms: np.ndarray[Any, Any], + ptm_locations: np.ndarray[Any, Any], + precursor_charge: int, ) -> str: """Parse peptide sequence and modifications to ProForma.""" peptidoform = list(stripped_peptide) n_term = "" c_term = "" - for ptm, ptm_location in zip(ptms, ptm_locations): - ptm = format_number_as_string(ptm) + for ptm, ptm_location in zip(ptms, ptm_locations, strict=True): + ptm_str = format_number_as_string(ptm) if ptm_location == -1: - n_term = f"[{ptm}]-" + n_term = f"[{ptm_str}]-" elif ptm_location == len(peptidoform): - c_term = f"-[{ptm}]" + c_term = f"-[{ptm_str}]" else: - peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm}]" + peptidoform[ptm_location] = f"{peptidoform[ptm_location]}[{ptm_str}]" return f"{n_term}{''.join(peptidoform)}{c_term}/{precursor_charge}" -def _parse_entry(entry: dict) -> PSM: +def _parse_entry(entry: 
dict[str, Any]) -> PSM: """Parse a single entry from ProteoScape Parquet file to PSM object.""" return PSM( peptidoform=_parse_peptidoform( @@ -93,7 +98,7 @@ def _parse_entry(entry: dict) -> PSM: entry["precursor_charge"], ), spectrum_id=entry["ms2_id"], - run=entry.get("run", None), + run=entry.get("run"), is_decoy=all(DECOY_PATTERN.match(p) for p in entry["locus_name"]), score=entry["x_corr_score"], precursor_mz=entry["precursor_mz"], diff --git a/psm_utils/io/sage.py b/psm_utils/io/sage.py index 2d62682..1111087 100644 --- a/psm_utils/io/sage.py +++ b/psm_utils/io/sage.py @@ -10,11 +10,13 @@ import csv from abc import ABC, abstractmethod +from collections.abc import Iterator from pathlib import Path -from typing import Iterable, Optional +from typing import Any -import pyarrow.parquet as pq -from pyteomics import mass +import pandas as pd +import pyarrow.parquet as pq # type: ignore[import] +from pyteomics import mass # type: ignore[import] from psm_utils.io._base_classes import ReaderBase from psm_utils.io._utils import set_csv_field_size_limit @@ -26,38 +28,45 @@ class _SageReaderBase(ReaderBase, ABC): def __init__( - self, filename, score_column: str = "sage_discriminant_score", *args, **kwargs + self, + filename: str | Path, + *args: Any, + score_column: str = "sage_discriminant_score", + **kwargs: Any, ) -> None: """ - Reader for Sage ``results.sage.tsv`` file. + Reader for Sage results file. Parameters ---------- - filename : str or Path + filename Path to PSM file. - score_column: str, optional + *args + Additional positional arguments passed to parent class. + score_column Name of the column that holds the primary PSM score. Default is ``sage_discriminant_score``, ``hyperscore`` could also be used. + **kwargs + Additional keyword arguments passed to parent class. 
""" super().__init__(filename, *args, **kwargs) - self.filename = filename self.score_column = score_column @abstractmethod - def __iter__(self) -> Iterable[PSM]: + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" raise NotImplementedError("Use `SageTSVReader` or `SageParquetReader` instead.") - def _get_peptide_spectrum_match(self, psm_dict) -> PSM: + def _get_peptide_spectrum_match(self, psm_dict: dict[str, Any]) -> PSM: """Parse a single PSM from a sage PSM file.""" - rescoring_features = {} + rescoring_features: dict[str, Any] = {} for ft in RESCORING_FEATURES: try: rescoring_features[ft] = psm_dict[ft] except KeyError: continue - + ion_mobility_features = self._extract_ion_mobility_features(psm_dict) rescoring_features.update(ion_mobility_features) @@ -73,36 +82,34 @@ def _get_peptide_spectrum_match(self, psm_dict) -> PSM: score=float(psm_dict[self.score_column]), precursor_mz=self._parse_precursor_mz(psm_dict["expmass"], psm_dict["charge"]), retention_time=float(psm_dict["rt"]), - ion_mobility=rescoring_features.get("ion_mobility", None), + ion_mobility=rescoring_features.get("ion_mobility"), protein_list=psm_dict["proteins"].split(";"), source="sage", rank=int(float(psm_dict["rank"])), - provenance_data=({"sage_filename": str(self.filename)}), + provenance_data={"sage_filename": self.filename.as_posix()}, rescoring_features=rescoring_features, metadata={}, ) @staticmethod - def _parse_peptidoform(peptide: str, charge: Optional[str]) -> str: + def _parse_peptidoform(peptide: str, charge: str | None) -> str: + """Parse peptide sequence and charge to peptidoform string.""" if charge: peptide += f"/{int(float(charge))}" return peptide @staticmethod - def _parse_precursor_mz(expmass: str, charge: Optional[str]) -> Optional[float]: + def _parse_precursor_mz(expmass: str, charge: str | None) -> float | None: + """Parse experimental mass and charge to precursor m/z.""" if charge: - charge = float(charge) - expmass = 
float(expmass) - return (expmass + (mass.nist_mass["H"][1][0] * charge)) / charge - else: - return None - + charge_val = float(charge) + expmass_val = float(expmass) + return (expmass_val + (mass.nist_mass["H"][1][0] * charge_val)) / charge_val + return None + @staticmethod - def _extract_ion_mobility_features(psm_dict: dict) -> dict: - """ - Extract ion mobility features from the PSM dictionary if present and non-zero. - Returns a dict with the relevant keys or an empty dict. - """ + def _extract_ion_mobility_features(psm_dict: dict[str, Any]) -> dict[str, float]: + """Extract ion mobility features from the PSM dictionary if present and non-zero.""" try: ion_mob = float(psm_dict["ion_mobility"]) if ion_mob: @@ -116,33 +123,37 @@ def _extract_ion_mobility_features(psm_dict: dict) -> dict: return {} @classmethod - def from_dataframe(cls, dataframe) -> PSMList: + def from_dataframe(cls, dataframe: pd.DataFrame) -> PSMList: """Create a PSMList from a Sage Pandas DataFrame.""" return PSMList( psm_list=[ - cls._get_peptide_spectrum_match(cls(""), entry) + cls._get_peptide_spectrum_match(cls(""), entry) # type: ignore[arg-type] for entry in dataframe.to_dict(orient="records") ] ) class SageTSVReader(_SageReaderBase): - def __iter__(self) -> Iterable[PSM]: + """Reader for Sage TSV results files.""" + + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "r") as open_file: + with open(self.filename) as open_file: reader = csv.DictReader(open_file, delimiter="\t") for row in reader: row["is_decoy"] = ( True if row["label"] == "-1" else False if row["label"] == "1" else None ) - yield self._get_peptide_spectrum_match(row) + SageReader = SageTSVReader # Alias for backwards compatibility class SageParquetReader(_SageReaderBase): - def __iter__(self) -> Iterable[PSM]: + """Reader for Sage Parquet results files.""" + + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" with 
pq.ParquetFile(self.filename) as pq_file: for batch in pq_file.iter_batches(): diff --git a/psm_utils/io/tsv.py b/psm_utils/io/tsv.py index 268cf96..9fd66a3 100644 --- a/psm_utils/io/tsv.py +++ b/psm_utils/io/tsv.py @@ -51,8 +51,9 @@ import ast import csv import logging +from collections.abc import Iterator from pathlib import Path -from typing import Optional +from typing import Any, TextIO from pydantic import ValidationError @@ -70,9 +71,9 @@ class TSVReader(ReaderBase): """Reader for psm_utils TSV format.""" - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with open(self.filename, "rt") as open_file: + with open(self.filename) as open_file: reader = csv.DictReader(open_file, delimiter="\t") failed_rows = 0 for row in reader: @@ -91,8 +92,8 @@ def __iter__(self): failed_rows = 0 @staticmethod - def _parse_entry(entry: dict) -> dict: - """Parse single TSV entry to :py:class:`~psm_utils.psm.PSM`.""" + def _parse_entry(entry: dict[str, str | None]) -> dict[str, Any]: + """Parse single TSV entry to PSM dict.""" # Replace empty strings with None entry = {k: v if v else None for k, v in entry.items()} @@ -106,17 +107,17 @@ def _parse_entry(entry: dict) -> dict: ) from e # Extract dict properties - parsed_entry = {} - provenance_data = {} - metadata = {} - rescoring_features = {} + parsed_entry: dict[str, Any] = {} + provenance_data: dict[str, str | None] = {} + metadata: dict[str, str | None] = {} + rescoring_features: dict[str, str | None] = {} for k, v in entry.items(): if k.startswith("provenance:"): - provenance_data[k[11:]] = str(v) + provenance_data[k[11:]] = v elif k.startswith("meta:"): - metadata[k[5:]] = str(v) + metadata[k[5:]] = v elif k.startswith("rescoring:"): - rescoring_features[k[10:]] = str(v) + rescoring_features[k[10:]] = v else: parsed_entry[k] = v @@ -132,23 +133,25 @@ def _parse_entry(entry: dict) -> dict: class TSVWriter(WriterBase): - """Reader for psm_utils TSV format.""" + 
"""Writer for psm_utils TSV format.""" def __init__( self, filename: str | Path, - example_psm: Optional[PSM] = None, - *args, - **kwargs, - ): + *args: Any, + example_psm: PSM | None = None, + **kwargs: Any, + ) -> None: """ - Reader for psm_utils TSV format. + Writer for psm_utils TSV format. Parameters ---------- - filename: str, Pathlib.Path + filename Path to PSM file. - example_psm: psm_utils.psm.PSM, optional + *args + Additional positional arguments passed to the base class. + example_psm Example PSM, required to extract the column names when writing to a new file. Should contain all fields that are to be written to the PSM file, i.e., all items in the :py:attr:`provenance_data`, :py:attr:`metadata`, and @@ -156,23 +159,27 @@ def __init__( not present in the example PSM will not be written to the file, even though they are present in other PSMs passed to :py:meth:`write_psm` or :py:meth:`write_file`. + **kwargs + Additional keyword arguments passed to the base class. + """ super().__init__(filename, *args, **kwargs) - self._open_file = None - self._writer = None + self._open_file: TextIO | None = None + self._writer: csv.DictWriter[str] | None = None if example_psm: - self.fieldnames = self._psm_to_entry(example_psm).keys() + self.fieldnames: list[str] | None = list(self._psm_to_entry(example_psm).keys()) else: self.fieldnames = None def __enter__(self) -> TSVWriter: + """Enter context manager for file writing.""" if Path(self.filename).is_file(): - with open(self.filename, "rt") as open_file: + with open(self.filename) as open_file: # Get fieldnames self.fieldnames = open_file.readline().strip().split("\t") - self._open_file = open(self.filename, "at", newline="") + self._open_file = open(self.filename, "a", newline="") self._writer = csv.DictWriter( self._open_file, fieldnames=self.fieldnames, @@ -182,7 +189,7 @@ def __enter__(self) -> TSVWriter: else: if not self.fieldnames: raise ValueError("`example_psm` required when writing to new file.") - 
self._open_file = open(self.filename, "wt", newline="") + self._open_file = open(self.filename, "w", newline="") self._writer = csv.DictWriter( self._open_file, fieldnames=self.fieldnames, @@ -192,43 +199,44 @@ def __enter__(self) -> TSVWriter: self._writer.writeheader() return self - def __exit__(self, *args, **kwargs) -> None: - self._open_file.close() - self._open_file = None + def __exit__(self, *args: Any, **kwargs: Any) -> None: + """Exit context manager and clean up file resources.""" + if self._open_file is not None: + self._open_file.close() + self._open_file = None self._writer = None - def write_psm(self, psm: PSM): + def write_psm(self, psm: PSM) -> None: """ Write a single PSM to new or existing PSM file. Parameters ---------- - psm: PSM + psm PSM object to write. """ - entry = self._psm_to_entry(psm) - try: - self._writer.writerow(entry) - except AttributeError as e: + if self._writer is None: raise PSMUtilsIOException( f"`write_psm` method can only be called if `{self.__class__.__qualname__}`" "is opened in context (i.e., using the `with` statement)." - ) from e + ) + entry = self._psm_to_entry(psm) + self._writer.writerow(entry) - def write_file(self, psm_list: PSMList): + def write_file(self, psm_list: PSMList) -> None: """ Write an entire PSMList to a new PSM file. Parameters ---------- - psm_list: PSMList + psm_list PSMList object to write to file. 
""" if not self.fieldnames: raise ValueError("`example_psm` required when writing to new file.") - with open(self.filename, "wt", newline="") as f: + with open(self.filename, "w", newline="") as f: writer = csv.DictWriter( f, fieldnames=self.fieldnames, delimiter="\t", extrasaction="ignore" ) @@ -237,7 +245,8 @@ def write_file(self, psm_list: PSMList): writer.writerow(self._psm_to_entry(psm)) @staticmethod - def _psm_to_entry(psm: PSM) -> dict: + def _psm_to_entry(psm: PSM) -> dict[str, Any]: + """Convert PSM object to dictionary entry for TSV writing.""" entry = psm.__dict__.copy() # Convert Peptidoform to proforma sequence diff --git a/psm_utils/io/xtandem.py b/psm_utils/io/xtandem.py index 43c9966..1102e2a 100644 --- a/psm_utils/io/xtandem.py +++ b/psm_utils/io/xtandem.py @@ -1,10 +1,8 @@ """ Interface with X!Tandem XML PSM files. - Notes ----- - * In X!Tandem XML, N/C-terminal modifications are encoded as normal modifications and are therefore parsed accordingly. Any information on which modifications are N/C-terminal is therefore lost. @@ -35,6 +33,7 @@ .. code-block:: [+39,99545] + """ from __future__ import annotations @@ -42,11 +41,12 @@ import logging import re import xml.etree.ElementTree as ET +from collections.abc import Iterator from pathlib import Path -from typing import Union +from typing import Any import numpy as np -from pyteomics import tandem +from pyteomics import tandem # type: ignore[import] from psm_utils.exceptions import PSMUtilsException from psm_utils.io._base_classes import ReaderBase @@ -57,33 +57,38 @@ class XTandemReader(ReaderBase): + """Reader for X!Tandem XML PSM files.""" + def __init__( self, - filename: Union[str, Path], - *args, - decoy_prefix="DECOY_", - score_key="expect", - **kwargs, + filename: str | Path, + *args: Any, + decoy_prefix: str = "DECOY_", + score_key: str = "expect", + **kwargs: Any, ) -> None: """ Reader for X!Tandem XML PSM files. 
Parameters ---------- - filename: str, pathlib.Path + filename Path to PSM file. - decoy_prefix: str, optional + *args + Additional positional arguments passed to parent class. + decoy_prefix Protein name prefix used to denote decoy protein entries. Default: ``"DECOY_"``. - score_key: str, optional + score_key Key of score to use as PSM score. One of ``"expect"``, ``"hyperscore"``, ``"delta"``, or ``"nextscore"``. Default: ``"expect"``. The ``"expect"`` score (e-value) is converted to its negative natural logarithm to facilitate downstream analysis. + **kwargs + Additional keyword arguments passed to parent class. Examples -------- - :py:class:`XTandemReader` supports iteration: >>> from psm_utils.io.xtandem import XTandemReader @@ -101,30 +106,29 @@ def __init__( >>> psm_list = reader.read_file() """ - super().__init__(filename) + super().__init__(filename, *args, **kwargs) self.decoy_prefix = decoy_prefix self.score_key = score_key - def __iter__(self): + def __iter__(self) -> Iterator[PSM]: """Iterate over file and return PSMs one-by-one.""" - with tandem.read(str(self.filename)) as reader: run = self._parse_run(self.filename) for entry in reader: - for psm in self._parse_entry(entry, run): - yield psm + yield from self._parse_entry(entry, run) @staticmethod - def _parse_peptidoform(peptide_entry, charge): + def _parse_peptidoform(peptide_entry: dict[str, Any], charge: int) -> Peptidoform: + """Parse peptidoform from X!Tandem peptide entry.""" if "aa" in peptide_entry: # Parse modifications - seq_list = list(peptide_entry["seq"]) - unmodified_seq = seq_list.copy() + seq_list: list[str] = list(peptide_entry["seq"]) + unmodified_seq: list[str] = seq_list.copy() for mod_entry in peptide_entry["aa"]: # Locations are encoded relative to position in protein - mod_loc = mod_entry["at"] - peptide_entry["start"] - mass_shift = float(mod_entry["modified"]) + mod_loc: int = mod_entry["at"] - peptide_entry["start"] + mass_shift: float = float(mod_entry["modified"]) # Check 
if site matches amino acid if not mod_entry["type"] == unmodified_seq[mod_loc]: @@ -136,7 +140,7 @@ def _parse_peptidoform(peptide_entry, charge): # Add to sequence in ProForma format seq_list[mod_loc] += f"[{format_number_as_string(mass_shift)}]" - proforma_seq = "".join(seq_list) + proforma_seq: str = "".join(seq_list) else: # No modifications to parse @@ -146,17 +150,17 @@ def _parse_peptidoform(peptide_entry, charge): return Peptidoform(proforma_seq) - def _parse_entry(self, entry, run: str) -> list: - """Parse X!Tandem XML entry to a list of :py:class:`~psm_utils.psm.PSM`.""" - pepform_to_psms = dict() + def _parse_entry(self, entry: dict[str, Any], run: str) -> list[PSM]: + """Parse X!Tandem XML entry to a list of PSMs.""" + peptidoform_psm_dict: dict[Peptidoform, PSM] = {} for protein_entry in entry["protein"]: peptide_entry = protein_entry["peptide"] peptidoform = self._parse_peptidoform(peptide_entry, entry["z"]) - if peptidoform not in pepform_to_psms: + if peptidoform not in peptidoform_psm_dict: psm = PSM( - peptidoform=self._parse_peptidoform(peptide_entry, entry["z"]), + peptidoform=peptidoform, spectrum_id=entry["support"]["fragment ion mass spectrum"]["note"], is_decoy=protein_entry["label"].startswith(self.decoy_prefix), score=( @@ -170,7 +174,7 @@ def _parse_entry(self, entry, run: str) -> list: protein_list=[protein_entry["note"]], source="X!Tandem", provenance_data={ - "xtandem_filename": str(self.filename), + "xtandem_filename": self.filename.as_posix(), "xtandem_id": str(entry["id"]), }, metadata={ @@ -179,21 +183,26 @@ def _parse_entry(self, entry, run: str) -> list: "xtandem_nextscore": str(peptide_entry["nextscore"]), }, ) - pepform_to_psms[peptidoform] = psm + peptidoform_psm_dict[peptidoform] = psm else: - pepform_to_psms[peptidoform].protein_list.append(protein_entry["note"]) - - return list(pepform_to_psms.values()) + psm_protein_list = peptidoform_psm_dict[peptidoform].protein_list + if psm_protein_list is None: + 
peptidoform_psm_dict[peptidoform].protein_list = [protein_entry["note"]] + else: + psm_protein_list.append(protein_entry["note"]) - def _parse_run(self, filepath): - """Parse X!Tandem XML run to :py:class:`~psm_utils.psm.PSM`.""" + return list(peptidoform_psm_dict.values()) + def _parse_run(self, filepath: str | Path) -> str: + """Parse run name from X!Tandem XML file.""" tree = ET.parse(str(filepath)) root = tree.getroot() - full_label = root.attrib["label"] - run_match = re.search(r"\/(?P<run>[^\s\/\\]+)\.(?P<ext>mgf|mzML|mzml)", full_label) + full_label: str = root.attrib["label"] + run_match: re.Match[str] | None = re.search( + r"\/(?P<run>[^\s\/\\]+)\.(?P<ext>mgf|mzML|mzml)", full_label + ) if run_match: - run = run_match.group("run") + run: str = run_match.group("run") else: run = Path(filepath).stem logger.warning( @@ -205,8 +214,12 @@ def _parse_run(self, filepath): class XTandemException(PSMUtilsException): + """Base exception for X!Tandem related errors.""" + pass class XTandemModificationException(XTandemException): + """Exception raised for unexpected modifications in X!Tandem XML files.""" + pass diff --git a/psm_utils/peptidoform.py b/psm_utils/peptidoform.py index f92e297..bb7335f 100644 --- a/psm_utils/peptidoform.py +++ b/psm_utils/peptidoform.py @@ -1,21 +1,22 @@ +"""Peptidoform module for handling peptide sequences with modifications and charge states.""" + from __future__ import annotations from collections import defaultdict -from typing import Iterable, List, Tuple, Union +from collections.abc import Iterable +from typing import Literal, TypedDict, cast import numpy as np -from pyteomics import mass, proforma +from pyteomics import mass, proforma # type: ignore[import] from psm_utils.exceptions import PSMUtilsException from psm_utils.utils import mass_to_mz class Peptidoform: - """ - Peptide sequence, modifications and charge state represented in ProForma notation. 
- """ + """Peptide sequence, modifications and charge state represented in ProForma notation.""" - def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: + def __init__(self, proforma_sequence: str | proforma.ProForma) -> None: """ Peptide sequence, modifications and charge state represented in ProForma notation. @@ -29,8 +30,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: ---------- parsed_sequence : list List of tuples with residue and modifications for each location. - properties : dict[str, Any] - Dict with sequence-wide properties. + properties : :py:class:`PeptidoformProperties` + Dictionary with properties of the peptidoform, including N- and C-terminal + modifications, unlocalized modifications, labile modifications, fixed + modifications, and charge state. Examples -------- @@ -39,6 +42,10 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: 711.2567622919099 """ + self.parsed_sequence: list[tuple[str, list[proforma.TagBase] | None]] + self.properties: PeptidoformProperties + + # Parse ProForma if isinstance(proforma_sequence, str): try: self.parsed_sequence, self.properties = proforma.parse(proforma_sequence) @@ -58,29 +65,45 @@ def __init__(self, proforma_sequence: Union[str, proforma.ProForma]) -> None: raise NotImplementedError("Peptidoforms with isotopes are currently not supported.") def __repr__(self) -> str: + """Return a string representation of the Peptidoform object.""" return f"{self.__class__.__qualname__}('{self.proforma}')" def __str__(self) -> str: + """Return the ProForma representation of the Peptidoform.""" return self.proforma def __hash__(self) -> int: + """Return a hash of the Peptidoform based on its ProForma representation.""" return hash(self.proforma) - def __eq__(self, __o: Union[Peptidoform, str]) -> bool: + def __eq__(self, __o: object) -> bool: + """Check equality of Peptidoform with another object.""" if isinstance(__o, str): return 
self.proforma == __o - elif isinstance(__o, Peptidoform): + elif isinstance(__o, Peptidoform): # type: ignore[return] return self.proforma == __o.proforma else: - raise TypeError(f"Cannot compare {type(__o)} with Peptidoform.") + raise TypeError(f"Unsupported comparison type for Peptidoform: {type(__o)}") + + def __lt__(self, __o: object) -> bool: + """Check if this Peptidoform is less than another object.""" + if isinstance(__o, str): + return self.proforma < __o + elif isinstance(__o, Peptidoform): + return self.proforma < __o.proforma + else: + raise TypeError(f"Unsupported comparison type for Peptidoform: {type(__o)}") - def __iter__(self) -> Iterable[Tuple[str, Union[None, List[proforma.TagBase]]]]: + def __iter__(self) -> Iterable[tuple[str, None | list[proforma.TagBase]]]: + """Return an iterator over the parsed sequence.""" return self.parsed_sequence.__iter__() def __len__(self) -> int: + """Return the length of the parsed sequence.""" return self.parsed_sequence.__len__() - def __getitem__(self, key: int) -> Tuple[str, Union[None, List[proforma.TagBase]]]: + def __getitem__(self, key: int) -> tuple[str, None | list[proforma.TagBase]]: + """Get the item at the specified index from the parsed sequence.""" return self.parsed_sequence.__getitem__(key) @property @@ -122,7 +145,7 @@ def modified_sequence(self) -> str: 'AC[U:4]DEK' """ - properties_without_charge = self.properties.copy() + properties_without_charge = dict(self.properties).copy() properties_without_charge.pop("charge_state", None) return proforma.to_proforma(self.parsed_sequence, **properties_without_charge) @@ -154,15 +177,14 @@ def is_modified(self) -> bool: modifications. 
""" - mod_properties = [ - "n_term", - "c_term", - "unlocalized_modifications", - "labile_modifications", - "fixed_modifications", - ] has_sequential = any(mods for _, mods in self.parsed_sequence) - has_other = any([self.properties[prop] for prop in mod_properties]) + has_other = ( + bool(self.properties["n_term"]) + or bool(self.properties["c_term"]) + or bool(self.properties["unlocalized_modifications"]) + or bool(self.properties["labile_modifications"]) + or bool(self.properties["fixed_modifications"]) + ) return has_sequential or has_other @property @@ -188,8 +210,9 @@ def sequential_composition(self) -> list[mass.Composition]: # Get compositions for fixed modifications by amino acid fixed_rules = {} for rule in self.properties["fixed_modifications"]: - for aa in rule.targets: - fixed_rules[aa] = rule.modification_tag.composition + if rule.targets is not None: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.composition comp_list = [] @@ -220,11 +243,12 @@ def sequential_composition(self) -> list[mass.Composition]: # Localized modifications if tags: for tag in tags: + tag = cast(proforma.ModificationBase, tag) try: position_comp += tag.composition except (AttributeError, KeyError) as e: raise ModificationException( - "Cannot resolve composition for modification " f"{tag.value}." + f"Cannot resolve composition for modification {tag.value}." ) from e comp_list.append(position_comp) @@ -275,7 +299,7 @@ def composition(self) -> mass.Composition: return comp @property - def sequential_theoretical_mass(self) -> float: + def sequential_theoretical_mass(self) -> list[float]: """ Monoisotopic mass of both termini and each (modified) residue. 
@@ -296,8 +320,9 @@ def sequential_theoretical_mass(self) -> float: """ fixed_rules = {} for rule in self.properties["fixed_modifications"]: - for aa in rule.targets: - fixed_rules[aa] = rule.modification_tag.mass + if rule.targets is not None: + for aa in rule.targets: + fixed_rules[aa] = rule.modification_tag.mass mass_list = [] @@ -326,11 +351,12 @@ def sequential_theoretical_mass(self) -> float: # Localized modifications if tags: for tag in tags: + tag = cast(proforma.ModificationBase, tag) try: position_mass += tag.mass except (AttributeError, KeyError) as e: raise ModificationException( - "Cannot resolve mass for modification " f"{tag.value}." + f"Cannot resolve mass for modification {tag.value}." ) from e mass_list.append(position_mass) @@ -350,7 +376,8 @@ def sequential_theoretical_mass(self) -> float: @property def theoretical_mass(self) -> float: - """Monoisotopic mass of the full uncharged peptidoform. + """ + Monoisotopic mass of the full uncharged peptidoform. Includes all modifications, also labile and unlocalized. @@ -409,7 +436,7 @@ def rename_modifications(self, mapping: dict[str, str]) -> None: requires renaming. Modification labels that are not in the mapping will not be renamed. 
- See also + See Also -------- psm_utils.psm_list.PSMList.rename_modifications @@ -425,29 +452,41 @@ def rename_modifications(self, mapping: dict[str, str]) -> None: """ - def _rename_modification_list(mods): + def _rename_modification_list( + mods: list[proforma.TagBase] | None, + ) -> list[proforma.TagBase] | None: + if mods is None: + return None + new_mods = [] for mod in mods: - try: - if isinstance(mod, proforma.MassModification): - mod_value = format_number_as_string(mod.value) - else: - mod_value = mod.value - if mod_value in mapping: - new_mods.append(proforma.process_tag_tokens(mapping[mod_value])) - else: - new_mods.append(mod) - except AttributeError: - if isinstance(mod, proforma.ModificationRule): - if mod.modification_tag.value in mapping: - mod.modification_tag = proforma.process_tag_tokens( - mapping[mod.modification_tag.value] - ) - new_mods.append(mod) - else: - mod.value # re-raise AttributeError + # Get value of the tag, formatted as string + if isinstance(mod, proforma.MassModification): + mod_value = format_number_as_string(mod.value) + else: + mod_value = mod.value + + # Rename modification if it is in the mapping + if mod_value in mapping: + new_mods.append(proforma.process_tag_tokens(mapping[mod_value])) + else: + new_mods.append(mod) + return new_mods + def _rename_modification_rule_list( + rules: list[proforma.ModificationRule] | None, + ) -> None: + if rules is None: + return None + + for rule in rules: + # Rename modification tag if it is in the mapping + if rule.modification_tag.value in mapping: + rule.modification_tag = proforma.process_tag_tokens( + mapping[rule.modification_tag.value] + ) + # Sequential modifications for i, (aa, mods) in enumerate(self.parsed_sequence): if mods: @@ -455,15 +494,15 @@ def _rename_modification_list(mods): self.parsed_sequence[i] = (aa, new_mods) # Non-sequence modifications - for mod_type in [ + for mod_type in ( "n_term", "c_term", "unlocalized_modifications", - "labile_modifications", - 
"fixed_modifications", - ]: - if self.properties[mod_type]: - self.properties[mod_type] = _rename_modification_list(self.properties[mod_type]) + ): + self.properties[mod_type] = _rename_modification_list(self.properties[mod_type]) # type: ignore[assignment] + + # Modification rules + _rename_modification_rule_list(self.properties["fixed_modifications"]) def add_fixed_modifications( self, modification_rules: list[tuple[str, list[str]]] | dict[str, list[str]] @@ -475,7 +514,7 @@ def add_fixed_modifications( added in the "fixed modifications" notation, at the front of the ProForma sequence. - See also + See Also -------- psm_utils.peptidoform.Peptidoform.apply_fixed_modifications @@ -496,15 +535,14 @@ def add_fixed_modifications( """ if isinstance(modification_rules, dict): - modification_rules = modification_rules.items() - modification_rules = [ + modification_rules = list(modification_rules.items()) + + parsed_modification_rules = [ proforma.ModificationRule(proforma.process_tag_tokens(mod), targets) for mod, targets in modification_rules ] - if self.properties["fixed_modifications"]: - self.properties["fixed_modifications"].extend(modification_rules) - else: - self.properties["fixed_modifications"] = modification_rules + + self.properties.setdefault("fixed_modifications", []).extend(parsed_modification_rules) def apply_fixed_modifications(self): """ @@ -514,7 +552,7 @@ def apply_fixed_modifications(self): (once at the beginning of the sequence) as modifications throughout the sequence at each affected amino acid residue. 
- See also + See Also -------- psm_utils.peptidoform.Peptidoform.apply_fixed_modifications @@ -530,8 +568,9 @@ def apply_fixed_modifications(self): # Setup target_aa -> modification_list dictionary rule_dict = defaultdict(list) for rule in self.properties["fixed_modifications"]: - for target_aa in rule.targets: - rule_dict[target_aa].append(rule.modification_tag) + if rule.targets is not None: + for target_aa in rule.targets: + rule_dict[target_aa].append(rule.modification_tag) # Apply modifications to sequence for i, (aa, site_mods) in enumerate(self.parsed_sequence): @@ -553,6 +592,25 @@ def apply_fixed_modifications(self): self.properties["fixed_modifications"] = [] +class PeptidoformProperties(TypedDict): + """Property items of a :py:class:`Peptidoform`.""" + + n_term: list[proforma.ModificationBase] | None + c_term: list[proforma.ModificationBase] | None + unlocalized_modifications: list[proforma.ModificationBase] + labile_modifications: list[proforma.ModificationBase] + fixed_modifications: list[proforma.ModificationRule] + charge_state: proforma.ChargeState + isotopes: list[proforma.StableIsotope] + + +_ModificationsProperty = Literal[ + "n_term", "c_term", "unlocalized_modifications", "labile_modifications" +] + +_ModificationRulesProperty = Literal["fixed_modifications"] + + def format_number_as_string(num): """Format number as string for ProForma mass modifications.""" # Using this method over `:+g` string formatting to avoid rounding and scientific notation diff --git a/psm_utils/psm.py b/psm_utils/psm.py index 2d01d08..4fd3520 100644 --- a/psm_utils/psm.py +++ b/psm_utils/psm.py @@ -1,8 +1,10 @@ +"""PSM module for handling peptide-spectrum matches.""" + from __future__ import annotations -from typing import Any, Dict, List, Optional, Union +from typing import Any -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel, ConfigDict, field_validator from psm_utils.peptidoform import Peptidoform @@ -10,29 +12,41 @@ class PSM(BaseModel): 
"""Data class representing a peptide-spectrum match (PSM).""" - peptidoform: Union[Peptidoform, str] - spectrum_id: Union[str] - run: Optional[str] = None - collection: Optional[str] = None - spectrum: Optional[Any] = None - is_decoy: Optional[bool] = None - score: Optional[float] = None - qvalue: Optional[float] = None - pep: Optional[float] = None - precursor_mz: Optional[float] = None - retention_time: Optional[float] = None - ion_mobility: Optional[float] = None - protein_list: Optional[List[str]] = None - rank: Optional[int] = None - source: Optional[str] = None - provenance_data: Optional[Dict[str, str]] = dict() - metadata: Optional[Dict[str, str]] = dict() - rescoring_features: Optional[Dict[str, float]] = dict() + peptidoform: Peptidoform + spectrum_id: str + run: str | None = None + collection: str | None = None + spectrum: Any | None = None + is_decoy: bool | None = None + score: float | None = None + qvalue: float | None = None + pep: float | None = None + precursor_mz: float | None = None + retention_time: float | None = None + ion_mobility: float | None = None + protein_list: list[str] | None = None + rank: int | None = None + source: str | None = None + provenance_data: dict[str, str] | None = dict() + metadata: dict[str, str] | None = dict() + rescoring_features: dict[str, float] | None = dict() + model_config = ConfigDict(arbitrary_types_allowed=True, coerce_numbers_to_str=True) - def __init__(self, **data): + @field_validator("peptidoform", mode="before") + @classmethod + def validate_peptidoform(cls, v: Peptidoform | str) -> Peptidoform: + """Convert string to Peptidoform if needed.""" + if isinstance(v, str): + return Peptidoform(v) + elif isinstance(v, Peptidoform): + return v + else: + raise TypeError(f"Peptidoform or str expected for `peptidoform`, not `{type(v)}`.") + + def __init__(self, **data: Any) -> None: # noqa: D417 """ - Data class representing a peptide-spectrum match (PSM). + Initialize a peptide-spectrum match (PSM). 
Links a :class:`~psm_utils.peptidoform.Peptidoform` to an observed spectrum and holds the related information. Attribute types are coerced and enforced upon @@ -87,27 +101,27 @@ def __init__(self, **data): """ super().__init__(**data) - # Parse peptidoform - if isinstance(self.peptidoform, str): - self.peptidoform = Peptidoform(self.peptidoform) - elif not isinstance(self.peptidoform, Peptidoform): - raise TypeError( - f"Peptidoform or str expected for `peptidoform`, not `{type(self.peptidoform)}`." - ) - def __getitem__(self, item) -> any: + def __getitem__(self, item) -> Any: + """Get an attribute of the PSM.""" return getattr(self, item) - def __setitem__(self, item, value: any) -> None: + def __setitem__(self, item, value: Any) -> None: + """Set an attribute of the PSM.""" setattr(self, item, value) @property def precursor_mz_error(self) -> float: """Difference between observed and theoretical m/z in Da.""" theoretical_mz = self.peptidoform.theoretical_mz + if theoretical_mz is None or self.precursor_mz is None: + raise ValueError( + "Cannot calculate precursor m/z error: " + "precursor m/z is not set or theoretical m/z cannot be calculated." 
+ ) return self.precursor_mz - theoretical_mz - def get_precursor_charge(self) -> int: + def get_precursor_charge(self) -> int | None: """Precursor charge, as embedded in :py:attr:`PSM.peptidoform`.""" return self.peptidoform.precursor_charge diff --git a/psm_utils/psm_list.py b/psm_utils/psm_list.py index c38ffdf..9309125 100644 --- a/psm_utils/psm_list.py +++ b/psm_utils/psm_list.py @@ -1,12 +1,15 @@ +"""PSMList module for handling collections of PSMs.""" + from __future__ import annotations import re -from typing import Iterable, List, Sequence +from collections.abc import Iterator, Sequence +from typing import cast, overload import numpy as np import pandas as pd from pydantic import BaseModel -from pyteomics import auxiliary, proforma +from pyteomics import auxiliary, proforma # type: ignore[import] from rich.pretty import pretty_repr from psm_utils.psm import NUMPY_DTYPES, PSM @@ -15,11 +18,11 @@ class PSMList(BaseModel): """Data class representing a list of PSMs.""" - psm_list: List[PSM] + psm_list: list[PSM] - def __init__(__pydantic_self__, **data) -> None: + def __init__(__pydantic_self__, **data) -> None: # type: ignore[override] # noqa: D417 """ - Data class representing a list of PSMs, with some useful functionality. + Represent a list of PSMs in a data class with added functionality. 
Parameters ---------- @@ -72,25 +75,41 @@ def __init__(__pydantic_self__, **data) -> None: super().__init__(**data) def __rich_repr__(self): + """Rich representation of the PSMList.""" yield "psm_list", self.psm_list def __repr__(self): + """Return a pretty representation of the PSMList.""" return pretty_repr(self, max_length=5) def __str__(self): + """Return a string representation of the PSMList.""" return self.__repr__() - def __add__(self, other): + def __add__(self, other: PSMList) -> PSMList: + """Concatenate two PSMLists.""" return PSMList(psm_list=self.psm_list + other.psm_list) - def __iter__(self) -> Iterable[PSM]: - return self.psm_list.__iter__() + def __iter__(self) -> Iterator[PSM]: # type: ignore[override] + """Iterate over the PSMList.""" + return iter(self.psm_list) def __len__(self) -> int: - return self.psm_list.__len__() + """Return the length of the PSMList.""" + return len(self.psm_list) + + @overload + def __getitem__(self, item: int | np.integer) -> PSM: ... + + @overload + def __getitem__(self, item: slice | Sequence[bool | int] | np.ndarray) -> PSMList: ... - def __getitem__(self, item) -> PSM | list[PSM]: - if isinstance(item, (int, np.integer)): + @overload + def __getitem__(self, item: str | Sequence[str]) -> np.ndarray: ... + + def __getitem__(self, item) -> PSM | PSMList | np.ndarray: + """Get PSM or PSMList by index, slice, or property name.""" + if isinstance(item, int | np.integer): # Return single PSM by index return self.psm_list[item] elif isinstance(item, slice): @@ -119,6 +138,13 @@ def __getitem__(self, item) -> PSM | list[PSM]: raise TypeError(f"Unsupported indexing type: {type(item)}") def __setitem__(self, item, values: Sequence) -> None: + """ + Set PSM property values for all PSMs in :py:class:`PSMList`. + + If the length of `values` does not match the length of the PSMList, + a ValueError is raised. 
+ + """ if not len(values) == len(self): raise ValueError(f"Expected value with same length as PSMList: {len(self)}") for value, psm in zip(values, self): @@ -127,16 +153,18 @@ def __setitem__(self, item, values: Sequence) -> None: @property def collections(self) -> list: """List of collections in :py:class:`PSMList`.""" - if (self["collection"] != None).any(): # noqa: E711 - return list(np.unique(self["collection"])) + collection_array = np.asarray(self["collection"]) + if (collection_array != None).any(): # noqa: E711 + return np.unique(collection_array).tolist() else: return [None] @property def runs(self) -> list: """List of runs in :py:class:`PSMList`.""" - if (self["run"] != None).any(): # noqa: E711 - return list(np.unique(self["run"])) + run_array = np.asarray(self["run"]) + if (run_array != None).any(): # noqa: E711 + return np.unique(run_array).tolist() else: return [None] @@ -168,14 +196,14 @@ def set_ranks(self, lower_score_better: bool = False): """Set identification ranks for all PSMs in :py:class:`PSMList`.""" columns = ["collection", "run", "spectrum_id", "score"] self["rank"] = ( - pd.DataFrame(self[columns], columns=columns) + pd.DataFrame(np.array([self[c] for c in columns]).transpose(), columns=columns) .sort_values("score", ascending=lower_score_better) .fillna(0) # groupby does not play well with None values .groupby(["collection", "run", "spectrum_id"]) .cumcount() .sort_index() + 1 # 1-based counting - ) + ).to_list() def get_rank1_psms(self, *args, **kwargs) -> PSMList: """ @@ -184,9 +212,10 @@ def get_rank1_psms(self, *args, **kwargs) -> PSMList: First runs :py:meth:`~set_ranks` with ``*args`` and ``**kwargs`` if if any PSM has no rank yet. 
""" - if None in self["rank"]: + rank_array = np.asarray(self["rank"]) + if None in rank_array: self.set_ranks(*args, **kwargs) - return self[self["rank"] == 1] + return PSMList(psm_list=[self.psm_list[i] for i in np.flatnonzero(rank_array == 1)]) def find_decoys(self, decoy_pattern: str) -> None: """ @@ -211,9 +240,12 @@ def find_decoys(self, decoy_pattern: str) -> None: >>> psm_list.find_decoys(r"^DECOY_") """ - decoy_pattern = re.compile(decoy_pattern) + pattern = re.compile(decoy_pattern) for psm in self: - psm.is_decoy = all([decoy_pattern.search(p) is not None for p in psm.protein_list]) + if psm.protein_list is not None: + psm.is_decoy = all(pattern.search(p) is not None for p in psm.protein_list) + else: + psm.is_decoy = None def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ @@ -233,7 +265,7 @@ def calculate_qvalues(self, reverse: bool = True, **kwargs) -> None: """ for key in ["score", "is_decoy"]: - if (self[key] == None).any(): # noqa: E711 (self[key] is a Numpy array) + if (np.asarray(self[key]) == None).any(): # noqa: E711 (self[key] is a Numpy array) raise ValueError( f"Cannot calculate q-values if not all PSMs have `{key}` assigned." ) @@ -264,7 +296,7 @@ def rename_modifications(self, mapping: dict[str, str]) -> None: requires renaming. Modification labels that are not in the mapping will not be renamed. - See also + See Also -------- psm_utils.peptidoform.Peptidoform.rename_modifications @@ -282,7 +314,7 @@ def add_fixed_modifications( added in the "fixed modifications" notation, at the front of the ProForma sequence. 
- See also + See Also -------- psm_utils.peptidoform.Peptidoform.add_fixed_modifications @@ -294,16 +326,17 @@ def add_fixed_modifications( """ if isinstance(modification_rules, dict): - modification_rules = modification_rules.items() - modification_rules = [ + modification_rules = list(modification_rules.items()) + + parsed_modification_rules = [ proforma.ModificationRule(proforma.process_tag_tokens(mod), targets) for mod, targets in modification_rules ] + for psm in self.psm_list: - if psm.peptidoform.properties["fixed_modifications"]: - psm.peptidoform.properties["fixed_modifications"].extend(modification_rules) - else: - psm.peptidoform.properties["fixed_modifications"] = modification_rules + psm.peptidoform.properties.setdefault("fixed_modifications", []).extend( # type: ignore[union-attr] + cast(list, parsed_modification_rules) + ) def apply_fixed_modifications(self): """ @@ -312,7 +345,7 @@ def apply_fixed_modifications(self): Applies :py:meth:`psm_utils.peptidoform.Peptidoform.apply_fixed_modifications` on all PSM peptidoforms in the :py:class:`PSMList`. 
- See also + See Also -------- psm_utils.peptidoform.Peptidoform.apply_fixed_modifications @@ -330,18 +363,20 @@ def to_dataframe(self) -> pd.DataFrame: def _is_iterable_of_bools(obj): + """Check if the object is an iterable of booleans.""" try: - if all(isinstance(x, (bool, np.bool_)) for x in obj): - return True - else: + if any(not isinstance(x, bool | np.bool_) for x in obj): return False + else: + return True except (TypeError, ValueError): return False def _is_iterable_of_ints(obj): + """Check if the object is an iterable of integers.""" try: - if not all(isinstance(x, (int, np.integer)) for x in obj): + if any(not isinstance(x, int | np.integer) for x in obj): return False else: return True @@ -350,8 +385,9 @@ def _is_iterable_of_ints(obj): def _is_iterable_of_strings(obj): + """Check if the object is an iterable of strings.""" try: - if not all(isinstance(x, str) for x in obj): + if any(not isinstance(x, str) for x in obj): return False else: return True diff --git a/psm_utils/utils.py b/psm_utils/utils.py index 7df9b33..82a65a1 100644 --- a/psm_utils/utils.py +++ b/psm_utils/utils.py @@ -1,11 +1,11 @@ """Various utility functions.""" -from typing import Optional +from __future__ import annotations -from pyteomics.mass import nist_mass +from pyteomics.mass import nist_mass # type: ignore[import-untyped] -def mass_to_mz(mass: float, charge: int, adduct_mass: Optional[float] = None) -> float: +def mass_to_mz(mass: float, charge: int, adduct_mass: float | None = None) -> float: """ Convert mass to m/z. 
@@ -20,11 +20,13 @@ def mass_to_mz(mass: float, charge: int, adduct_mass: Optional[float] = None) -> """ if adduct_mass is None: - adduct_mass = nist_mass["H"][1][0] - return (mass + charge * adduct_mass) / charge + _adduct_mass = nist_mass["H"][1][0] + else: + _adduct_mass = float(adduct_mass) + return (mass + charge * _adduct_mass) / charge -def mz_to_mass(mz: float, charge: int, adduct_mass: Optional[float] = None) -> float: +def mz_to_mass(mz: float, charge: int, adduct_mass: float | None = None) -> float: """ Convert m/z to mass. @@ -39,5 +41,7 @@ def mz_to_mass(mz: float, charge: int, adduct_mass: Optional[float] = None) -> f """ if adduct_mass is None: - adduct_mass = nist_mass["H"][1][0] - return mz * charge - charge * adduct_mass + _adduct_mass = nist_mass["H"][1][0] + else: + _adduct_mass = float(adduct_mass) + return mz * charge - charge * _adduct_mass diff --git a/pyproject.toml b/pyproject.toml index 39e1125..d4a07a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ classifiers = [ "Development Status :: 4 - Beta", ] dynamic = ["version"] -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = [ "click", "lxml", @@ -29,11 +29,11 @@ dependencies = [ "pydantic >= 2", "pyteomics >= 4", "rich", - "sqlalchemy", + "sqlalchemy >= 2", ] [project.optional-dependencies] -dev = ["ruff", "isort>5", "pytest", "pytest-cov"] +dev = ["ruff", "isort>5", "pytest", "pytest-cov", "mypy"] docs = [ "numpydoc>=1,<2", "recommonmark", @@ -71,8 +71,16 @@ profile = "black" [tool.black] line-length = 99 -target-version = ['py38'] +target-version = ['py310'] [tool.ruff] line-length = 99 -target-version = "py38" +target-version = "py310" + +[tool.ruff.lint] +select = ["E4", "E7", "E9", "F", "PD", "D", "UP"] +ignore = ["D203", "D212"] + +[tool.mypy] +files = ["psm_utils/**/*.py"] +install_types = true diff --git a/tests/test_data/minimal_test.msf b/tests/test_data/minimal_test.msf new file mode 100644 index 0000000..1a8bf89 Binary files /dev/null and 
b/tests/test_data/minimal_test.msf differ diff --git a/tests/test_data/minimal_v79_test.msf b/tests/test_data/minimal_v79_test.msf new file mode 100644 index 0000000..874f9ac Binary files /dev/null and b/tests/test_data/minimal_v79_test.msf differ diff --git a/tests/test_io/test_idxml.py b/tests/test_io/test_idxml.py index dae34da..a159716 100644 --- a/tests/test_io/test_idxml.py +++ b/tests/test_io/test_idxml.py @@ -1,7 +1,5 @@ """Tests for psm_utils.io.idxml.""" -import hashlib - import pytest from psm_utils.io.idxml import IdXMLReader, IdXMLWriter @@ -12,6 +10,15 @@ pyopenms = pytest.importorskip("pyopenms") +def _assert_float_equal(a: float | None, b: float | None, tolerance: float = 1e-5) -> None: + """Assert two float values are equal within tolerance, handling None values.""" + if a is None and b is None: + return + if a is None or b is None: + assert False, f"One value is None: {a} vs {b}" + assert abs(a - b) < tolerance, f"Values not equal within tolerance: {a} vs {b}" + + class TestIdXMLReader: def test__parse_peptidoform(self): test_cases = [ @@ -56,9 +63,19 @@ def test__parse_psm(self): "protein_references": "unique", }, rescoring_features={ + "MS:1002258": 3.0, + "MS:1002259": 12.0, + "num_matched_peptides": 35.0, + "isotope_error": 0.0, "MS:1002252": 0.693, + "COMET:xcorr": 0.693, + "MS:1002253": 1.0, "COMET:deltaCn": 1.0, "MS:1002255": 35.9, + "COMET:spscore": 35.9, + "MS:1002256": 1.0, + "COMET:sprank": 1.0, + "MS:1002257": 1.01, "COMET:deltaLCn": 0.0, "COMET:lnExpect": 0.009950330853168092, "COMET:lnNumSP": 3.555348061489414, @@ -79,9 +96,19 @@ def test__get_run(self): def test__get_rescoring_features(self): expected_output = [ + "MS:1002258", + "MS:1002259", + "num_matched_peptides", + "isotope_error", "MS:1002252", + "COMET:xcorr", + "MS:1002253", "COMET:deltaCn", "MS:1002255", + "COMET:spscore", + "MS:1002256", + "COMET:sprank", + "MS:1002257", "COMET:deltaLCn", "COMET:lnExpect", "COMET:lnNumSP", @@ -96,23 +123,170 @@ def 
test__get_rescoring_features(self): class TestIdXMLWriter: def test_write_file_with_pyopenms_objects(self): - expected_sha = "8d8cb6d8194c5c296f0f5ee8be83d2072be125547b2d51b88100859b001f47fa" + """Test writing idXML file with existing pyopenms objects and verify content.""" reader = IdXMLReader("./tests/test_data/test_in.idXML") - psm_list = reader.read_file() + original_psm_list = reader.read_file() + + # Write the file writer = IdXMLWriter( "./tests/test_data/test_out.idXML", reader.protein_ids, reader.peptide_ids ) - writer.write_file(psm_list) - sha = hashlib.sha256(open("./tests/test_data/test_out.idXML", "rb").read()).hexdigest() - assert sha == expected_sha + writer.write_file(original_psm_list) + + # Read back the written file and verify content + reader_check = IdXMLReader("./tests/test_data/test_out.idXML") + written_psm_list = reader_check.read_file() + + # Verify basic file structure + assert len(written_psm_list) == len(original_psm_list) + + # Compare key attributes of each PSM + for orig_psm, written_psm in zip(original_psm_list, written_psm_list): + assert str(orig_psm.peptidoform) == str(written_psm.peptidoform) + assert orig_psm.spectrum_id == written_psm.spectrum_id + assert orig_psm.run == written_psm.run + assert orig_psm.is_decoy == written_psm.is_decoy + _assert_float_equal(orig_psm.score, written_psm.score) + _assert_float_equal(orig_psm.precursor_mz, written_psm.precursor_mz) + _assert_float_equal(orig_psm.retention_time, written_psm.retention_time) + assert orig_psm.protein_list == written_psm.protein_list + assert orig_psm.rank == written_psm.rank + + # Check that rescoring features are preserved + if orig_psm.rescoring_features: + assert written_psm.rescoring_features is not None + for feature_name, feature_value in orig_psm.rescoring_features.items(): + assert feature_name in written_psm.rescoring_features + assert abs(written_psm.rescoring_features[feature_name] - feature_value) < 1e-6 def 
test_write_file_without_pyopenms_objects(self): - expected_sha = "148889926276fbe391e23ed7952c3a8410fc67ffb099bbf1a72df75f8d727ccd" #TODO: can cause problems locally depending on dependency versions + """Test writing idXML file from scratch without existing pyopenms objects.""" reader = SageTSVReader("./tests/test_data/results.sage.tsv") - psm_list = reader.read_file() + original_psm_list = reader.read_file() + + # Write the file writer = IdXMLWriter("./tests/test_data/test_out_sage.idXML") + writer.write_file(original_psm_list) + + # Read back the written file and verify content + reader_check = IdXMLReader("./tests/test_data/test_out_sage.idXML") + written_psm_list = reader_check.read_file() + + # Verify basic file structure + assert len(written_psm_list) == len(original_psm_list) + + # Compare key attributes of the first PSM (since sage data has one entry) + orig_psm = original_psm_list[0] + written_psm = written_psm_list[0] + + assert str(orig_psm.peptidoform) == str(written_psm.peptidoform) + assert orig_psm.spectrum_id == written_psm.spectrum_id + assert orig_psm.run == written_psm.run + assert orig_psm.is_decoy == written_psm.is_decoy + _assert_float_equal(orig_psm.score, written_psm.score) + _assert_float_equal(orig_psm.precursor_mz, written_psm.precursor_mz) + _assert_float_equal(orig_psm.retention_time, written_psm.retention_time) + assert orig_psm.protein_list == written_psm.protein_list + assert orig_psm.rank == written_psm.rank + + # Verify that the written file is a valid idXML (can be read without errors) + assert len(reader_check.protein_ids) > 0 + assert len(reader_check.peptide_ids) > 0 + + def test_write_file_preserves_modifications(self): + """Test that modifications are properly preserved when writing idXML files.""" + from psm_utils.psm_list import PSMList + + # Create test PSMs with various modifications + test_psms = [ + PSM( + peptidoform="ACDK/2", + spectrum_id="scan=1", + score=140.2, + retention_time=600.2, + precursor_mz=300.15, + 
run="test_run", + ), + PSM( + peptidoform="AC[Carbamidomethyl]DK/2", + spectrum_id="scan=2", + score=150.3, + retention_time=650.1, + precursor_mz=357.17, + run="test_run", + ), + PSM( + peptidoform="[Acetyl]-ACDK/2", + spectrum_id="scan=3", + score=120.8, + retention_time=580.5, + precursor_mz=342.16, + run="test_run", + ), + ] + + psm_list = PSMList(psm_list=test_psms) + + # Write and read back + writer = IdXMLWriter("./tests/test_data/test_mods.idXML") writer.write_file(psm_list) - sha = hashlib.sha256( - open("./tests/test_data/test_out_sage.idXML", "rb").read() - ).hexdigest() - assert sha == expected_sha + + reader_check = IdXMLReader("./tests/test_data/test_mods.idXML") + written_psm_list = reader_check.read_file() + + # Verify modifications are preserved + assert len(written_psm_list) == len(test_psms) + + for orig_psm, written_psm in zip(test_psms, written_psm_list): + # The peptidoform should be preserved (though the exact string representation might differ) + assert orig_psm.peptidoform.sequence == written_psm.peptidoform.sequence + assert ( + orig_psm.peptidoform.precursor_charge == written_psm.peptidoform.precursor_charge + ) + + # Basic properties should match + assert orig_psm.spectrum_id == written_psm.spectrum_id + _assert_float_equal(orig_psm.score, written_psm.score) + _assert_float_equal(orig_psm.retention_time, written_psm.retention_time) + _assert_float_equal(orig_psm.precursor_mz, written_psm.precursor_mz) + + def test_write_file_with_metadata_and_features(self): + """Test that metadata and rescoring features are preserved.""" + from psm_utils.psm_list import PSMList + + test_psm = PSM( + peptidoform="TESTPEPTIDE/2", + spectrum_id="scan=100", + score=200.5, + retention_time=1000.0, + precursor_mz=500.25, + run="feature_test", + qvalue=0.01, + pep=0.05, + metadata={"custom_meta": "test_value", "intensity": "12345"}, + rescoring_features={"custom_score": 0.85, "feature_2": 1.23}, + ) + + psm_list = PSMList(psm_list=[test_psm]) + + # Write and 
read back + writer = IdXMLWriter("./tests/test_data/test_features.idXML") + writer.write_file(psm_list) + + reader_check = IdXMLReader("./tests/test_data/test_features.idXML") + written_psm_list = reader_check.read_file() + + assert len(written_psm_list) == 1 + written_psm = written_psm_list[0] + + # Check basic attributes + assert str(test_psm.peptidoform) == str(written_psm.peptidoform) + assert test_psm.spectrum_id == written_psm.spectrum_id + _assert_float_equal(test_psm.score, written_psm.score) + _assert_float_equal(test_psm.qvalue, written_psm.qvalue) + _assert_float_equal(test_psm.pep, written_psm.pep) + + # Check that custom features are preserved + assert written_psm.rescoring_features is not None + assert "custom_score" in written_psm.rescoring_features + assert abs(written_psm.rescoring_features["custom_score"] - 0.85) < 1e-6 diff --git a/tests/test_io/test_peptide_record.py b/tests/test_io/test_peptide_record.py index c3e9393..b78fe61 100644 --- a/tests/test_io/test_peptide_record.py +++ b/tests/test_io/test_peptide_record.py @@ -5,29 +5,40 @@ from psm_utils.io.peptide_record import ( InvalidPeprecError, InvalidPeprecModificationError, - _PeptideRecord, + _analyze_peprec_file, peprec_to_proforma, ) -class TestPeptideRecord: - def test__infer_separator(self): +class TestPeprecFileAnalysis: + def test_analyze_peprec_file(self): # Tab - p = _PeptideRecord("./tests/test_data/peprec.tsv") - assert p.separator == "\t" + separator, header = _analyze_peprec_file("./tests/test_data/peprec.tsv") + assert separator == "\t" + assert "spec_id" in header + assert "peptide" in header + assert "modifications" in header # Comma - p = _PeptideRecord("./tests/test_data/peprec.csv") - assert p.separator == "," + separator, header = _analyze_peprec_file("./tests/test_data/peprec.csv") + assert separator == "," + assert "spec_id" in header + assert "peptide" in header + assert "modifications" in header # Space - p = _PeptideRecord("./tests/test_data/peprec.txt") - assert 
p.separator == " " + separator, header = _analyze_peprec_file("./tests/test_data/peprec.txt") + assert separator == " " + assert "spec_id" in header + assert "peptide" in header + assert "modifications" in header # Invalid: Mixed use of separators with pytest.raises(InvalidPeprecError): - p = _PeptideRecord("./tests/test_data/peprec_invalid.csv") + _analyze_peprec_file("./tests/test_data/peprec_invalid.csv") + +class TestPeprecToProforma: def test_peprec_to_proforma(self): # Valid cases valid_test_cases = [ @@ -38,7 +49,10 @@ def test_peprec_to_proforma(self): (("ACDMEK", "-1|Amidation"), "ACDMEK-[Amidation]"), (("ACDMEK", "4|Oxidation|-1|Amidation"), "ACDM[Oxidation]EK-[Amidation]"), (("ACDMEK", "0|Acetyl|4|Ox|-1|Amide"), "[Acetyl]-ACDM[Ox]EK-[Amide]"), - # (("ACDMEK", "6|Methylation|-1|Amide"), "ACDMEK[Methylation]-[Amide]"), # See levitsky/pyteomics/#77 + ( + ("ACDMEK", "6|Methylation|-1|Amide"), + "ACDMEK[Methylation]-[Amide]", + ), # See levitsky/pyteomics/#77 (("MCDMEK", "0|Acetyl|1|Oxidation"), "[Acetyl]-M[Oxidation]CDMEK"), (("ACDMEK", "", "2"), "ACDMEK/2"), ] diff --git a/tests/test_io/test_proteome_discoverer.py b/tests/test_io/test_proteome_discoverer.py new file mode 100644 index 0000000..485a192 --- /dev/null +++ b/tests/test_io/test_proteome_discoverer.py @@ -0,0 +1,554 @@ +""" +Tests for psm_utils.io.proteome_discoverer. + +This comprehensive test suite covers the MSFReader class through both unit tests with mocks +and integration tests with real MSF data. The test structure includes: + +1. Unit tests: Fast, isolated tests using mocks for individual methods +2. Integration tests: End-to-end tests using the minimal MSF test file +3. Error handling tests: Edge cases and error conditions +4. Performance tests: Validation of method behaviors with real data + +The test suite validates SQLAlchemy 2.0 patterns, proper data extraction, +and complete PSM object construction. 
+""" + +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest + +from psm_utils import Peptidoform +from psm_utils.io.proteome_discoverer import COMPATIBLE_VERSIONS, MSFReader + + +class TestMSFReaderUnit: + """Unit tests for MSFReader using mocks - fast and isolated.""" + + @pytest.fixture + def mock_reader_setup(self): + """Set up common mocked MSFReader for tests.""" + with ( + patch("psm_utils.io.proteome_discoverer.create_engine") as mock_engine, + patch("psm_utils.io.proteome_discoverer.Session") as mock_session_class, + ): + mock_session = Mock() + mock_session_class.return_value = mock_session + + # Default version check + mock_session.execute.return_value.first.return_value = (79,) + + test_file = Path("test.msf") + test_file.touch() + + yield { + "mock_engine": mock_engine, + "mock_session_class": mock_session_class, + "mock_session": mock_session, + "test_file": test_file, + } + + test_file.unlink(missing_ok=True) + + def test_init_success(self, mock_reader_setup): + """Test successful MSFReader initialization.""" + setup = mock_reader_setup + + MSFReader(setup["test_file"]) + + # Verify SQLAlchemy setup + setup["mock_engine"].assert_called_once() + setup["mock_session_class"].assert_called_once() + + # Verify version check was called + assert setup["mock_session"].execute.called + + def test_init_no_version_info(self, mock_reader_setup): + """Test initialization when MSF file has no version information.""" + setup = mock_reader_setup + setup["mock_session"].execute.return_value.first.return_value = None + + with patch("psm_utils.io.proteome_discoverer.logger") as mock_logger: + MSFReader(setup["test_file"]) + + mock_logger.warning.assert_called_once() + warning_msg = mock_logger.warning.call_args[0][0] + assert "does not contain version information" in warning_msg + + @pytest.mark.parametrize("version", [79, 53, 8]) + def test_compatible_versions(self, mock_reader_setup, version): + """Test that compatible versions don't raise 
warnings.""" + setup = mock_reader_setup + setup["mock_session"].execute.return_value.first.return_value = (version,) + + with patch("psm_utils.io.proteome_discoverer.logger") as mock_logger: + MSFReader(setup["test_file"]) + + # Should not log any warnings for compatible versions + mock_logger.warning.assert_not_called() + + def test_incompatible_version_warning(self, mock_reader_setup): + """Test warning for incompatible MSF version.""" + setup = mock_reader_setup + setup["mock_session"].execute.return_value.first.return_value = (999,) + + with patch("psm_utils.io.proteome_discoverer.logger") as mock_logger: + MSFReader(setup["test_file"]) + + mock_logger.warning.assert_called_once() + warning_msg = mock_logger.warning.call_args[0][0] + assert "version 999 might not be compatible" in warning_msg + + def test_len_method(self, mock_reader_setup): + """Test __len__ method with mocked counts.""" + setup = mock_reader_setup + + def mock_execute_side_effect(stmt): + stmt_str = str(stmt) + mock_result = Mock() + + if "SchemaInfo" in stmt_str and "Version" in stmt_str: + mock_result.first.return_value = (79,) + elif "Peptides_decoy" in stmt_str: + mock_result.scalar.return_value = 150 + else: # Regular Peptides table + mock_result.scalar.return_value = 1000 + + return mock_result + + setup["mock_session"].execute.side_effect = mock_execute_side_effect + + reader = MSFReader(setup["test_file"]) + assert len(reader) == 1150 # 1000 + 150 + + def test_get_modifications_structure(self, mock_reader_setup): + """Test _get_modifications method structure and return format.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + # Reset and mock modification query + setup["mock_session"].execute.reset_mock() + mock_results = [ + (1, 0, 4), # PeptideID 1, Position 0, UnimodAccession 4 + (1, 3, 35), # PeptideID 1, Position 3, UnimodAccession 35 + (2, 1, 21), # PeptideID 2, Position 1, UnimodAccession 21 + ] + setup["mock_session"].execute.return_value = 
mock_results + + modifications = reader._get_modifications(is_decoy=False) + + assert isinstance(modifications, dict) + assert modifications[1] == [(0, 4), (3, 35)] + assert modifications[2] == [(1, 21)] + assert len(modifications[1]) == 2 + assert len(modifications[2]) == 1 + + def test_get_terminal_modifications_structure(self, mock_reader_setup): + """Test _get_terminal_modifications method structure.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + setup["mock_session"].execute.reset_mock() + mock_results = [ + (1, 1, 4), # PeptideID 1, PositionType 1 (N-term), UnimodAccession 4 + (2, 2, 17), # PeptideID 2, PositionType 2 (C-term), UnimodAccession 17 + ] + setup["mock_session"].execute.return_value = mock_results + + terminal_mods = reader._get_terminal_modifications(is_decoy=False) + + assert isinstance(terminal_mods, dict) + assert terminal_mods[1] == [(1, 4)] + assert terminal_mods[2] == [(2, 17)] + + def test_get_protein_entries_structure(self, mock_reader_setup): + """Test _get_protein_entries method structure.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + setup["mock_session"].execute.reset_mock() + mock_results = [ + (1, "sp|P12345|PROT1_HUMAN Protein 1"), + (1, "sp|Q67890|PROT2_HUMAN Protein 2"), + (2, ">tr|R12345|PROT3_HUMAN Protein 3"), + ] + setup["mock_session"].execute.return_value = mock_results + + proteins = reader._get_protein_entries(is_decoy=False) + + assert isinstance(proteins, dict) + assert len(proteins[1]) == 2 + assert proteins[1][0] == "sp|P12345|PROT1_HUMAN Protein 1" + assert proteins[2][0] == "tr|R12345|PROT3_HUMAN Protein 3" # ">" should be removed + + def test_get_main_score_structure(self, mock_reader_setup): + """Test _get_main_score method structure.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + setup["mock_session"].execute.reset_mock() + mock_results = [ + (1, 95.5, "XCorr"), + (2, 88.2, "Mascot Score"), + ] + 
setup["mock_session"].execute.return_value = mock_results + + scores = reader._get_main_score(is_decoy=False) + + assert isinstance(scores, dict) + assert scores[1] == (95.5, "XCorr") + assert scores[2] == (88.2, "Mascot Score") + + def test_get_secondary_scores_structure(self, mock_reader_setup): + """Test _get_secondary_scores method structure.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + setup["mock_session"].execute.reset_mock() + mock_results = [ + (1, 0.95, "Confidence"), + (1, 15.2, "Delta Score"), + (2, 0.88, "Confidence"), + ] + setup["mock_session"].execute.return_value = mock_results + + scores = reader._get_secondary_scores(is_decoy=False) + + assert isinstance(scores, dict) + assert scores[1]["Confidence"] == 0.95 + assert scores[1]["Delta Score"] == 15.2 + assert scores[2]["Confidence"] == 0.88 + + @pytest.mark.parametrize( + "sequence,charge", + [ + ("PEPTIDE", 2), + ("METHYLATION", 3), + ("ACDEFGHIKLMNPQRSTVWY", 4), + ], + ) + def test_compile_peptidoform_basic(self, mock_reader_setup, sequence, charge): + """Test _compile_peptidoform with various basic sequences.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + peptidoform = reader._compile_peptidoform( + sequence=sequence, charge=charge, modifications=[], terminal_modifications=[] + ) + + assert isinstance(peptidoform, Peptidoform) + assert sequence in str(peptidoform) + + def test_compile_peptidoform_with_modifications(self, mock_reader_setup): + """Test _compile_peptidoform with amino acid and terminal modifications.""" + setup = mock_reader_setup + reader = MSFReader(setup["test_file"]) + + peptidoform = reader._compile_peptidoform( + sequence="PEPTIDE", + charge=2, + modifications=[(0, 4), (3, 35)], # Acetyl at pos 0, Oxidation at pos 3 + terminal_modifications=[(1, 1), (2, 17)], # N-term and C-term mods + ) + + assert isinstance(peptidoform, Peptidoform) + # Verify modifications are included in the peptidoform + peptidoform_str = 
str(peptidoform) + # The sequence will be modified with UNIMOD annotations + assert any(aa in peptidoform_str for aa in "PEPTIDE") + assert "UNIMOD" in peptidoform_str # Should contain modification annotations + + def test_compatible_versions_constant(self): + """Test COMPATIBLE_VERSIONS constant is properly defined.""" + assert isinstance(COMPATIBLE_VERSIONS, list) + assert len(COMPATIBLE_VERSIONS) > 0 + assert all(isinstance(v, int) for v in COMPATIBLE_VERSIONS) + assert 79 in COMPATIBLE_VERSIONS + assert 53 in COMPATIBLE_VERSIONS + assert 8 in COMPATIBLE_VERSIONS + + +class TestMSFReaderIntegration: + """Integration tests using the real minimal MSF file.""" + + @pytest.fixture + def minimal_msf_path(self): + """Path to the minimal MSF test file.""" + path = Path(__file__).parent.parent / "test_data" / "minimal_v79_test.msf" + if not path.exists(): + pytest.skip("Minimal MSF test file not found") + return path + + @pytest.fixture + def reader(self, minimal_msf_path): + """MSFReader instance with minimal test file.""" + return MSFReader(minimal_msf_path) + + def test_initialization_with_real_file(self, minimal_msf_path): + """Test successful initialization with real MSF file.""" + reader = MSFReader(minimal_msf_path) + assert reader is not None + assert reader.filename == minimal_msf_path + + def test_len_with_real_data(self, reader): + """Test __len__ method with real MSF data.""" + psm_count = len(reader) + assert psm_count > 0 + assert isinstance(psm_count, int) + + def test_iteration_yields_correct_count(self, reader): + """Test that iteration yields the same number of PSMs as len().""" + expected_count = len(reader) + actual_psms = list(reader) + assert len(actual_psms) == expected_count + + def test_psm_structure_and_types(self, reader): + """Test that PSMs have correct structure and data types.""" + psms = list(reader) + assert len(psms) > 0 + + first_psm = psms[0] + + # Test required attributes exist + assert hasattr(first_psm, "peptidoform") + assert 
hasattr(first_psm, "spectrum_id") + assert hasattr(first_psm, "run") + assert hasattr(first_psm, "is_decoy") + assert hasattr(first_psm, "score") + assert hasattr(first_psm, "precursor_mz") + assert hasattr(first_psm, "retention_time") + assert hasattr(first_psm, "protein_list") + assert hasattr(first_psm, "rank") + assert hasattr(first_psm, "source") + assert hasattr(first_psm, "metadata") + assert hasattr(first_psm, "rescoring_features") + + # Test data types + assert isinstance(first_psm.peptidoform, Peptidoform) + assert isinstance(first_psm.is_decoy, bool) + assert isinstance(first_psm.score, int | float) + assert isinstance(first_psm.protein_list, list) + assert isinstance(first_psm.rank, int) + assert first_psm.source == "proteome_discoverer" + assert isinstance(first_psm.metadata, dict) + assert isinstance(first_psm.rescoring_features, dict) + + def test_target_and_decoy_psms(self, reader): + """Test that both target and decoy PSMs are present (if available).""" + psms = list(reader) + + target_psms = [psm for psm in psms if not psm.is_decoy] + decoy_psms = [psm for psm in psms if psm.is_decoy] + + # At least one type should be present + assert len(target_psms) > 0 or len(decoy_psms) > 0 + + # If both are present, verify they have the expected structure + if target_psms and decoy_psms: + assert len(target_psms) > 0 + assert len(decoy_psms) > 0 + + def test_psm_metadata_content(self, reader): + """Test that PSM metadata contains expected keys.""" + psms = list(reader) + first_psm = psms[0] + + # Test required metadata keys + required_metadata_keys = [ + "ms1_intensity", + "ms1_percent_isolation_interference", + "ms1_ion_inject_time", + "main_score_name", + ] + + for key in required_metadata_keys: + assert key in first_psm.metadata + + def test_rescoring_features_content(self, reader): + """Test that rescoring features contain expected data.""" + psms = list(reader) + first_psm = psms[0] + + # Test required rescoring feature keys + required_rescoring_keys = 
["missed_cleavages", "total_ions_count", "matched_ions_count"] + + for key in required_rescoring_keys: + assert key in first_psm.rescoring_features + # Values can be int or float depending on database storage + assert isinstance(first_psm.rescoring_features[key], int | float) + + def test_peptidoform_sequences_valid(self, reader): + """Test that peptidoform sequences contain valid amino acids.""" + psms = list(reader) + + valid_amino_acids = set("ACDEFGHIKLMNPQRSTVWY") + + for psm in psms[:5]: # Test first 5 PSMs + peptidoform_str = str(psm.peptidoform) + # Extract base sequence (before any charge or modification info) + sequence = peptidoform_str.split("/")[0] + # Remove any modification annotations + clean_sequence = "".join(c for c in sequence if c.isalpha()) + + # All characters should be valid amino acids + assert all(aa in valid_amino_acids for aa in clean_sequence), ( + f"Invalid amino acids in sequence: {clean_sequence}" + ) + + def test_unique_spectrum_ids(self, reader): + """Test that spectrum IDs are unique (within decoy/target groups).""" + psms = list(reader) + + target_spectrum_ids = {psm.spectrum_id for psm in psms if not psm.is_decoy} + decoy_spectrum_ids = {psm.spectrum_id for psm in psms if psm.is_decoy} + + target_psms = [psm for psm in psms if not psm.is_decoy] + decoy_psms = [psm for psm in psms if psm.is_decoy] + + # Within each group, spectrum IDs should be unique per PSM + if target_psms: + assert len(target_spectrum_ids) <= len(target_psms) + if decoy_psms: + assert len(decoy_spectrum_ids) <= len(decoy_psms) + + def test_score_values_reasonable(self, reader): + """Test that score values are reasonable numbers.""" + psms = list(reader) + + for psm in psms: + assert isinstance(psm.score, int | float) + assert not (psm.score != psm.score) # Check for NaN + # Scores should be finite + assert abs(psm.score) < float("inf") + + +class TestMSFReaderErrorHandling: + """Test error handling and edge cases.""" + + def test_nonexistent_file(self): + 
"""Test handling of nonexistent MSF file.""" + with pytest.raises(Exception): # Should raise some form of file not found error + MSFReader("nonexistent_file.msf") + + @patch("psm_utils.io.proteome_discoverer.create_engine") + def test_database_connection_error(self, mock_create_engine): + """Test handling of database connection errors.""" + mock_create_engine.side_effect = Exception("Database connection failed") + + test_file = Path("test.msf") + test_file.touch() + + try: + with pytest.raises(Exception): + MSFReader(test_file) + finally: + test_file.unlink() + + def test_read_file_method(self, minimal_msf_path): + """Test the read_file() method returns PSMList.""" + reader = MSFReader(minimal_msf_path) + psm_list = reader.read_file() + + # Should return a list-like object + assert hasattr(psm_list, "__iter__") + assert hasattr(psm_list, "__len__") + assert len(psm_list) > 0 + + def test_reader_reusability(self, minimal_msf_path): + """Test that MSFReader can be reused for multiple operations.""" + reader = MSFReader(minimal_msf_path) + + # Multiple length checks should work + psm_count1 = len(reader) + psm_count2 = len(reader) + assert psm_count1 == psm_count2 + assert psm_count1 > 0 + + def test_multiple_iterations(self, minimal_msf_path): + """Test that multiple iterations over the same reader work consistently.""" + reader = MSFReader(minimal_msf_path) + + first_iteration = list(reader) + second_iteration = list(reader) + + assert len(first_iteration) == len(second_iteration) + assert len(first_iteration) > 0 + + @pytest.fixture + def minimal_msf_path(self): + """Path to the minimal MSF test file.""" + path = Path(__file__).parent.parent / "test_data" / "minimal_v79_test.msf" + if not path.exists(): + pytest.skip("Minimal MSF test file not found") + return path + + +class TestMSFReaderPerformance: + """Performance and stress tests for MSFReader.""" + + @pytest.fixture + def minimal_msf_path(self): + """Path to the minimal MSF test file.""" + path = 
Path(__file__).parent.parent / "test_data" / "minimal_v79_test.msf" + if not path.exists(): + pytest.skip("Minimal MSF test file not found") + return path + + def test_lazy_iteration_memory_efficiency(self, minimal_msf_path): + """Test that iteration is memory efficient (doesn't load all PSMs at once).""" + reader = MSFReader(minimal_msf_path) + + # Should be able to iterate without loading everything into memory + psm_count = 0 + for psm in reader: + psm_count += 1 + if psm_count > 5: # Just test first few PSMs + break + + assert psm_count > 0 + + def test_consistent_psm_ordering(self, minimal_msf_path): + """Test that PSM ordering is consistent across iterations.""" + reader = MSFReader(minimal_msf_path) + + first_batch = [] + for i, psm in enumerate(reader): + first_batch.append(psm.spectrum_id) + if i >= 4: # First 5 PSMs + break + + second_batch = [] + for i, psm in enumerate(reader): + second_batch.append(psm.spectrum_id) + if i >= 4: # First 5 PSMs + break + + assert first_batch == second_batch, "PSM ordering should be consistent" + + def test_all_required_psm_attributes(self, minimal_msf_path): + """Test that all PSMs have all required attributes populated.""" + reader = MSFReader(minimal_msf_path) + + required_attrs = [ + "peptidoform", + "spectrum_id", + "run", + "is_decoy", + "score", + "precursor_mz", + "retention_time", + "protein_list", + "rank", + "source", + "metadata", + "rescoring_features", + ] + + for i, psm in enumerate(reader): + for attr in required_attrs: + assert hasattr(psm, attr), f"PSM {i} missing attribute: {attr}" + # None values are acceptable, but attribute must exist + getattr(psm, attr) + + if i >= 9: # Test first 10 PSMs + break