Merge pull request #11 from MannLabs/development
Development
jalew188 authored Aug 18, 2023
2 parents 35b5fda + c1a2ee2 commit 5d6ebbf
Showing 28 changed files with 1,647 additions and 353 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.1.0
+current_version = 0.1.1
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
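For reference, a quick sanity check of what the parse pattern above accepts (an illustrative snippet, not part of the diff):

import re

# The parse pattern from .bumpversion.cfg above.
pattern = re.compile(
    r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
    r"(\-(?P<release>[a-z]+)(?P<build>\d+))?"
)

print(pattern.match("0.1.1").groupdict())
# {'major': '0', 'minor': '1', 'patch': '1', 'release': None, 'build': None}
print(pattern.match("0.1.1-dev1").groupdict())
# {'major': '0', 'minor': '1', 'patch': '1', 'release': 'dev', 'build': '1'}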
4 changes: 2 additions & 2 deletions .github/workflows/publish_and_release.yml
@@ -44,7 +44,7 @@ jobs:
cd release/pypi
. ./prepare_pypi_wheel.sh
- name: Publish distribution to Test PyPI
-uses: pypa/gh-action-pypi-publish@master
+uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.TEST_PYPI_ALPHARAW_TOKEN }}
repository_url: https://test.pypi.org/legacy/
@@ -54,7 +54,7 @@ jobs:
cd release/pypi
. ./install_test_pypi_wheel.sh
- name: Publish distribution to PyPI
-uses: pypa/gh-action-pypi-publish@master
+uses: pypa/gh-action-pypi-publish@release/v1
with:
password: ${{ secrets.PYPI_ALPHARAW_TOKEN }}
Test_PyPi_Release:
Empty file removed HISTORY.md
17 changes: 9 additions & 8 deletions alpharaw/__init__.py
@@ -1,17 +1,18 @@
#!python

-try:
+def register_readers():
    from .ms_data_base import ms_reader_provider
-    from .sciex import SciexWiffData
-    from .thermo import ThermoRawData
    from .legacy_msdata import mgf
    from .legacy_msdata import mzml
    from .mzml import MzMLReader
    from .wrappers import alphapept_wrapper
-except ImportError:
-    pass
+    try:
+        from .sciex import SciexWiffData
+        from .thermo import ThermoRawData
+    except (RuntimeError, ImportError):
+        return "[WARN] pythonnet is not installed"

__project__ = "alpharaw"
__version__ = "0.1.0"
__version__ = "0.1.1"
__license__ = "Apache"
__description__ = "An open-source Python package to unify raw MS data accession and storage."
__author__ = "Mann Labs"
@@ -49,5 +50,5 @@
# "Scientific paper": None,
}
__extra_requirements__ = {
"development": "requirements_development.txt",
"development": "extra_requirements/development.txt",
}
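A minimal sketch of how the new lazy registration is meant to be used; get_reader and import_raw are assumed entry points of ms_reader_provider and MSData_Base, not shown in this diff:

import alpharaw
from alpharaw.ms_data_base import ms_reader_provider

# Importing the reader modules registers them with ms_reader_provider;
# the Sciex/Thermo readers are skipped if pythonnet is missing.
alpharaw.register_readers()

# Assumed factory call: look up a reader by its registered name.
reader = ms_reader_provider.get_reader("mgf")
reader.import_raw("/path/to/spectra.mgf")  # illustrative path
print(reader.spectrum_df.head())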
165 changes: 165 additions & 0 deletions alpharaw/bruker/ap_ff.py
@@ -0,0 +1,165 @@
import pandas as pd
import numpy as np
import sqlalchemy as db

import subprocess
import os
import platform

from tqdm import tqdm


def extract_bruker(file:str, base_dir:str = "ext/bruker/FF", config:str = "proteomics_4d.config"):
    """Call the Bruker Feature Finder via subprocess.

    Args:
        file (str): Filename for feature finding.
        base_dir (str, optional): Base directory where the feature finder is stored. Defaults to "ext/bruker/FF".
        config (str, optional): Config file for the feature finder. Defaults to "proteomics_4d.config".

    Raises:
        NotImplementedError: Unsupported operating system.
        FileNotFoundError: Feature finder not found.
        FileNotFoundError: Config file not found.
        FileNotFoundError: Feature file not found.
    """
    feature_path = file + '/' + os.path.split(file)[-1] + '.features'

    base_dir = os.path.join(os.path.dirname(__file__), base_dir)

    operating_system = platform.system()

    if operating_system == 'Linux':
        ff_dir = os.path.join(base_dir, 'linux64', 'uff-cmdline2')
        print('Using Linux FF')
    elif operating_system == 'Windows':
        ff_dir = os.path.join(base_dir, 'win64', 'uff-cmdline2.exe')
        print('Using Windows FF')
    else:
        raise NotImplementedError(f"System {operating_system} not supported.")

    if os.path.exists(feature_path):
        return feature_path
    else:
        if not os.path.isfile(ff_dir):
            raise FileNotFoundError(f'Bruker feature finder cmd not found here {ff_dir}.')

        config_path = base_dir + '/' + config

        if not os.path.isfile(config_path):
            raise FileNotFoundError(f'Config file not found here {config_path}.')

        if operating_system == 'Windows':
            FF_parameters = [ff_dir, '--ff 4d', f'--readconfig "{config_path}"', f'--analysisDirectory "{file}"']

            process = subprocess.Popen(' '.join(FF_parameters), stdout=subprocess.PIPE)
            for line in iter(process.stdout.readline, b''):
                logtxt = line.decode('utf8')
                print(logtxt[48:].rstrip())  # Remove logging info from FF
        elif operating_system == 'Linux':
            FF_parameters = [
                ff_dir,
                '--ff',
                '4d',
                '--readconfig',
                config_path,
                '--analysisDirectory',
                file
            ]
            process = subprocess.run(FF_parameters, stdout=subprocess.PIPE)

        if os.path.exists(feature_path):
            return feature_path
        else:
            raise FileNotFoundError(f"Feature file {feature_path} does not exist.")

def convert_bruker(feature_path:str)->pd.DataFrame:
    """Read the Bruker feature table and convert it into a feature table to be used with AlphaPept.

    Args:
        feature_path (str): Path to the feature file from Bruker FF (.features-file).

    Returns:
        pd.DataFrame: DataFrame containing the feature information.
        pd.DataFrame: DataFrame containing the feature-to-cluster mapping.
    """
    engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path))
    feature_table = pd.read_sql_table('LcTimsMsFeature', engine_featurefile)
    feature_cluster_mapping = pd.read_sql_table('FeatureClusterMapping', engine_featurefile)

    # feature_table['Mass'] = feature_table['MZ'].values * feature_table['Charge'].values - feature_table['Charge'].values*M_PROTON
    feature_table = feature_table.rename(columns={
        "MZ": "mz", "Mass": "mass", "RT": "rt_apex",
        "RT_lower": "rt_start", "RT_upper": "rt_end",
        "Mobility": "mobility", "Mobility_lower": "mobility_lower",
        "Mobility_upper": "mobility_upper", "Charge": "charge",
        "Intensity": 'ms1_int_sum_apex', "ClusterCount": 'n_isotopes'
    })
    feature_table['rt_apex'] = feature_table['rt_apex']/60
    feature_table['rt_start'] = feature_table['rt_start']/60
    feature_table['rt_end'] = feature_table['rt_end']/60

    feature_cluster_mapping = feature_cluster_mapping.rename(columns={
        "FeatureId": "feature_id", "ClusterId": "cluster_id",
        "Monoisotopic": "monoisotopic", "Intensity": "ms1_int_sum_apex"
    })

    return feature_table, feature_cluster_mapping


def map_bruker(feature_path:str, feature_table:pd.DataFrame, query_data:dict)->pd.DataFrame:
    """Map MS1 features to MS2 precursors via the table FeaturePrecursorMapping from Bruker FF.

    Args:
        feature_path (str): Path to the feature file from Bruker FF (.features-file).
        feature_table (pd.DataFrame): Pandas DataFrame containing the features.
        query_data (dict): Data structure containing the query data.

    Returns:
        pd.DataFrame: DataFrame containing the feature information.
    """
    engine_featurefile = db.create_engine('sqlite:///{}'.format(feature_path))

    mapping = pd.read_sql_table('FeaturePrecursorMapping', engine_featurefile)
    mapping = mapping.set_index('PrecursorId')
    feature_table = feature_table.set_index('Id')

    query_prec_id = query_data['prec_id']

    # Now look up the feature for each precursor
    mass_matched = []
    mz_matched = []
    rt_matched = []
    query_idx = []
    f_idx = []

    for idx, prec_id in tqdm(enumerate(query_prec_id)):
        try:
            f_id = mapping.loc[prec_id]['FeatureId']
            all_matches = feature_table.loc[f_id]
            if type(f_id) == np.int64:
                match = all_matches
                mz_matched.append(match['mz'])
                rt_matched.append(match['rt_apex'])
                mass_matched.append(match['mass'])
                query_idx.append(idx)
                f_idx.append(match['FeatureId'])
            else:
                for k in range(len(all_matches)):
                    match = all_matches.iloc[k]
                    mz_matched.append(match['mz'])
                    rt_matched.append(match['rt_apex'])
                    mass_matched.append(match['mass'])
                    query_idx.append(idx)
                    f_idx.append(match['FeatureId'])

        except KeyError:
            pass

    features = pd.DataFrame(np.array([mass_matched, mz_matched, rt_matched, query_idx, f_idx]).T, columns=['mass_matched', 'mz_matched', 'rt_matched', 'query_idx', 'feature_idx'])

    features['query_idx'] = features['query_idx'].astype('int')

    return features
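Taken together, a hedged sketch of how these three helpers chain into a feature-finding pipeline; the .d directory and query_data contents are placeholders, not from this diff:

# Placeholder Bruker .d analysis directory.
raw_dir = "/data/run01.d"

# Run (or reuse) Bruker FF, then load the resulting SQLite tables.
feature_path = extract_bruker(raw_dir)
feature_table, feature_cluster_mapping = convert_bruker(feature_path)

# query_data must expose precursor ids under 'prec_id'.
query_data = {"prec_id": [1, 2, 3]}
features = map_bruker(feature_path, feature_table, query_data)
print(features.head())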
69 changes: 40 additions & 29 deletions alpharaw/legacy_msdata/mgf.py
Expand Up @@ -2,7 +2,8 @@

from alpharaw.ms_data_base import (
    index_ragged_list, MSData_Base,
-    ms_reader_provider
+    ms_reader_provider,
+    PEAK_MZ_DTYPE, PEAK_INTENSITY_DTYPE
)

def read_until(file, until):
@@ -33,13 +34,16 @@ def _import(self, _path:str):
            f = open(_path)
        else:
            f = _path
-        scanset = set()
+        scan_mz_dict = {}
+        scan_charge_dict = {}
        masses_list = []
        intens_list = []
        spec_idx_list = []
        scan_list = []
        rt_list = []
        precursor_mz_list = []
        charge_list = []
+        self._has_chimeras = False
        while True:
            line = f.readline()
            if not line: break
@@ -63,33 +67,40 @@ def _import(self, _path:str):
                    elif line.startswith('PEPMASS='):
                        precursor_mz = float(line.split('=')[1])
                    elif line.startswith('CHARGE='):
-                        charge = float(line.split('=')[1].strip()[:-1])
+                        charge = int(line.split('=')[1].strip()[:-1])
                if not scan:
                    title = find_line(lines, 'TITLE=')
                    scan = parse_scan_from_TITLE(title)
-                if scan in scanset: continue
-                scanset.add(scan)
+                if scan in scan_mz_dict:
+                    scan_mz_dict[scan].append(precursor_mz)
+                    scan_charge_dict[scan].append(charge)
+                    self._has_chimeras = True
+                    continue
+                scan_mz_dict[scan] = [precursor_mz]
+                scan_charge_dict[scan] = [charge]
                scan_list.append(scan)
                spec_idx_list.append(scan-1)
                rt_list.append(RT)
                precursor_mz_list.append(precursor_mz)
                charge_list.append(charge)
-                masses_list.append(np.array(masses))
-                intens_list.append(np.array(intens))
+                masses_list.append(np.array(masses, dtype=PEAK_MZ_DTYPE))
+                intens_list.append(np.array(intens, dtype=PEAK_INTENSITY_DTYPE))
        if isinstance(_path, str):
            f.close()

-        precursor_mz_list = np.array(precursor_mz_list)
+        if self._has_chimeras:
+            precursor_mz_list = [scan_mz_dict[scan] for scan in scan_list]
+            charge_list = [scan_charge_dict[scan] for scan in scan_list]

        return {
            'peak_indices': index_ragged_list(masses_list),
            'peak_mz': np.concatenate(masses_list),
            'peak_intensity': np.concatenate(intens_list),
            'rt': np.array(rt_list),
            'precursor_mz': precursor_mz_list,
-            'isolation_mz_lower': precursor_mz_list-2,
-            'isolation_mz_upper': precursor_mz_list+2,
            'spec_idx': np.array(spec_idx_list, dtype=np.int64),
-            'precursor_charge': np.array(charge_list, dtype=np.int8),
            'scan': np.array(scan_list, dtype=np.int64),
+            'precursor_charge': charge_list,
        }

    def _set_dataframes(self, raw_data:dict):
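_set_dataframes below consumes 'peak_indices' as start/end offsets into the flat peak arrays. A sketch of what index_ragged_list presumably computes; the real helper lives in ms_data_base and is not shown in this diff:

import numpy as np

def index_ragged_list_sketch(ragged):
    # Cumulative offsets: peaks of spectrum i live at flat[idx[i]:idx[i+1]].
    indices = np.zeros(len(ragged) + 1, dtype=np.int64)
    indices[1:] = np.cumsum([len(a) for a in ragged])
    return indices

masses_list = [np.array([100.1, 200.2]), np.array([150.5])]
idx = index_ragged_list_sketch(masses_list)  # array([0, 2, 3])
flat = np.concatenate(masses_list)
print(flat[idx[0]:idx[1]])                   # peaks of the first spectrum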
@@ -102,28 +113,28 @@ def _set_dataframes(self, raw_data:dict):
        end_idxes[spec_idxes] = raw_data['peak_indices'][1:]
-        rt_values = np.zeros(spec_num)
-        rt_values[spec_idxes] = raw_data['rt']
-        precursor_mzs = np.zeros(spec_num)
-        precursor_mzs[spec_idxes] = raw_data['precursor_mz']
-        mz_lowers = np.zeros(spec_num)
-        mz_lowers[spec_idxes] = raw_data['isolation_mz_lower']
-        mz_uppers = np.zeros(spec_num)
-        mz_uppers[spec_idxes] = raw_data['isolation_mz_upper']
-        charges = np.zeros(spec_num, np.int8)
-        charges[spec_idxes] = raw_data['precursor_charge']
+        if self._has_chimeras:
+            precursor_mzs = [[]]*spec_num
+            charges = [[]]*spec_num
+            mz_vals = raw_data['precursor_mz']
+            ch_vals = raw_data["precursor_charge"]
+            for i,idx in enumerate(spec_idxes):
+                precursor_mzs[idx] = mz_vals[i]
+                charges[idx] = ch_vals[i]
+        else:
+            precursor_mzs = np.zeros(spec_num)
+            precursor_mzs[spec_idxes] = raw_data['precursor_mz']
+            charges = np.zeros(spec_num, np.int8)
+            charges[spec_idxes] = raw_data['precursor_charge']
+
+        self.spectrum_df["charge"] = charges
+        self.spectrum_df["precursor_mz"] = precursor_mzs

-        self.set_peaks_by_cat_array(
+        self.set_peak_df_by_indexed_array(
            raw_data['peak_mz'],
            raw_data['peak_intensity'],
            start_idxes,end_idxes
        )
-        self.add_column_in_spec_df('rt', rt_values)
-        self.add_column_in_spec_df('charge', charges)
+        self.add_column_in_spec_df_by_spec_idxes('rt', raw_data['rt'], spec_idxes, na_value=0)
        self.spectrum_df['ms_level'] = 2
-        self.set_precursor_mz(
-            precursor_mzs
-        )
-        self.set_precursor_mz_windows(
-            mz_lowers,mz_uppers
-        )

ms_reader_provider.register_reader('mgf', MGFReader)
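To see the chimeric handling end to end, a hypothetical MGF with two BEGIN IONS blocks sharing one scan number; import_raw is the assumed MSData_Base entry point that calls _import and _set_dataframes, and the scan number, masses, and charges are invented for illustration:

reader = MGFReader()
reader.import_raw("/path/to/chimeric.mgf")  # illustrative path

# If scan 1234 appeared twice with PEPMASS 455.23 (2+) and 602.71 (3+),
# its spectrum_df row now carries list-valued columns instead of scalars:
# precursor_mz == [455.23, 602.71], charge == [2, 3].
print(reader._has_chimeras)
print(reader.spectrum_df.loc[1233, ["precursor_mz", "charge"]])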
1 change: 0 additions & 1 deletion alpharaw/legacy_msdata/mzml.py

This file was deleted.

(Diffs for the remaining changed files are not shown here.)