Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
Pampa Valentin
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Hottlet Valentin
Pampa Valentin
Commits
a52051f6
Unverified
Commit
a52051f6
authored
5 months ago
by
touzet
Committed by
GitHub
5 months ago
Browse files
Options
Downloads
Patches
Plain Diff
Delete src/known_markers.py
parent
007cd524
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/known_markers.py
+0
-166
0 additions, 166 deletions
src/known_markers.py
with
0 additions
and
166 deletions
src/known_markers.py
deleted
100644 → 0
+
0
−
166
View file @
007cd524
"""
known_markers.py
"""
from
src
import
utils
as
ut
from
src
import
compute_masses
as
mass
from
src
import
sequences
from
src
import
markers
def
hamming_distance
(
seq1
,
seq2
):
if
len
(
seq1
)
!=
len
(
seq2
):
return
-
1
return
sum
(
c1
!=
c2
for
c1
,
c2
in
zip
(
seq1
,
seq2
))
def
find_markers_single_sequence
(
seq
,
set_of_digested_peptides
,
dict_of_model_markers
,
set_of_markers
):
# seq: protein (object defined in sequences.py)
# dict_of_model_markers, dictionary, key: peptide sequence [string], value: set of 3-uplets (PTM, code, gene_name)
# dict_of_taxid, key: (sequence,PTM), value: set of taxid
# for each marker of dict_of_model_markers (characterized by a *code*) find the best location in the sequence seq.
# output: set of markers
set_of_found_PTM_codes
=
set
()
# contains the set of pairs (PTM,code) for model markers found in the current sequence
set_of_raw_digested_peptides
=
{
pep
.
sequence
for
pep
in
set_of_digested_peptides
}
# markers already present in the set of markers (same PTM, code and taxid)
set_of_new_markers
=
{
m
for
m
in
set_of_markers
if
seq
.
taxid
==
m
.
taxid
}
set_of_unknown_sequences
=
set
()
for
m
in
set_of_new_markers
:
# markers for the same taxid
if
m
.
sequence
not
in
set_of_raw_digested_peptides
:
set_of_unknown_sequences
.
add
(
m
)
# add the marker + comment (cleavage site missing)
else
:
pos
=
(
seq
.
sequence
).
find
(
m
.
sequence
)
if
pos
>=
0
:
m
.
begin
=
str
(
pos
+
1
)
m
.
seqid
=
seq
.
seqid
m
.
end
=
str
(
pos
+
len
(
m
.
sequence
))
if
m
.
mass
==
None
:
m
.
mass
=
str
(
mass
.
peptide_mass_with_PTM
(
m
.
sequence
,
m
.
ptm
))
set_of_found_PTM_codes
.
add
((
m
.
ptm
,
m
.
code
))
set_of_new_markers
.
difference
(
set_of_unknown_sequences
)
found_markers
=
{}
# key: name of the marker, value: triplet (position, hamming distance, marker sequence)
#markers found with exact match in some other organism
for
marker_seq
in
dict_of_model_markers
:
pos
=
(
seq
.
sequence
).
find
(
marker_seq
)
if
(
pos
>=
0
):
set_of_PTM_codes
=
{(
s
[
0
],
s
[
1
])
for
s
in
dict_of_model_markers
[
marker_seq
]
if
seq
.
protein
==
s
[
2
]
}
for
PTM_code
in
set_of_PTM_codes
:
if
PTM_code
not
in
set_of_found_PTM_codes
:
found_markers
[
PTM_code
]
=
(
pos
,
0
,
marker_seq
)
set_of_found_PTM_codes
.
update
(
found_markers
.
keys
())
set_of_found_codes
=
{
s
[
1
]
for
s
in
set_of_found_PTM_codes
}
# other markers
for
marker_seq
in
dict_of_model_markers
.
keys
():
set_of_protein_names
=
{
s
[
2
]
for
s
in
dict_of_model_markers
[
marker_seq
]}
if
seq
.
protein
not
in
set_of_protein_names
:
continue
set_of_PTM_codes
=
{(
s
[
0
],
s
[
1
])
for
s
in
dict_of_model_markers
[
marker_seq
]}
set_of_codes
=
{
s
[
1
]
for
s
in
dict_of_model_markers
[
marker_seq
]}
if
set_of_codes
<
set_of_found_codes
:
continue
l
=
len
(
marker_seq
)
for
pos
in
range
(
0
,
len
(
seq
.
sequence
)
-
l
):
d
=
hamming_distance
((
seq
.
sequence
)[
pos
:
pos
+
l
],
marker_seq
)
if
d
<
len
(
marker_seq
)
/
10
+
1
:
for
PTM_code
in
set_of_PTM_codes
:
if
(
PTM_code
not
in
found_markers
)
or
(
d
<
found_markers
[
PTM_code
][
1
]):
found_markers
[
PTM_code
]
=
(
pos
,
d
,
marker_seq
)
for
code
in
found_markers
:
(
pos
,
d
,
marker_seq
)
=
found_markers
[
code
]
l
=
len
(
marker_seq
)
new_sequence
=
seq
.
sequence
[
pos
:
pos
+
l
]
if
new_sequence
in
set_of_raw_digested_peptides
:
found
=
True
new_cleavage
=
False
else
:
found
=
False
for
peptide
in
set_of_raw_digested_peptides
:
seq_pos
=
peptide
.
find
(
new_sequence
)
if
(
seq_pos
>=
0
):
new_sequence
=
peptide
pos
=
pos
+
seq_pos
l
=
len
(
peptide
)
found
=
True
new_cleavage
=
True
if
not
found
:
continue
# this loop seems weird, but is actually useful.
# this is due to the fact that a given marker_name can have multiple PTMs. In this case, each possibility gives rise to a distinct marker.
for
model_marker
in
dict_of_model_markers
[
marker_seq
]:
if
((
model_marker
[
0
],
model_marker
[
1
]))
!=
code
:
continue
new_marker
=
markers
.
Marker
()
new_marker
.
taxid
=
seq
.
taxid
new_marker
.
taxon_name
=
seq
.
taxon_name
new_marker
.
seqid
=
seq
.
seqid
new_marker
.
sequence
=
new_sequence
new_marker
.
begin
=
str
(
pos
+
1
)
helical_start
=
sequences
.
helical_region
(
seq
.
sequence
)[
0
]
if
helical_start
==
None
or
helical_start
>
pos
:
new_marker
.
helical
=
None
else
:
new_marker
.
helical
=
str
(
pos
-
helical_start
+
2
)
new_marker
.
end
=
str
(
pos
+
l
)
new_marker
.
rank
=
"
species
"
new_marker
.
ptm
=
model_marker
[
0
]
new_marker
.
mass
=
mass
.
peptide_mass_with_PTM
(
seq
.
sequence
[
pos
:
pos
+
l
],
model_marker
[
0
])
# à modifier pour conserver la masse
new_marker
.
code
=
model_marker
[
1
]
new_marker
.
protein
=
model_marker
[
2
]
new_marker
.
comment
=
"
Homology :
"
+
seq
.
sequence
[
pos
-
1
]
+
"
- peptide -
"
+
seq
.
sequence
[
pos
+
l
]
+
"
,
"
if
d
==
0
:
new_marker
.
comment
=
new_marker
.
comment
+
"
exact match
"
else
:
new_marker
.
comment
=
new_marker
.
comment
+
str
(
d
)
+
"
mismatches with
"
+
marker_seq
if
new_cleavage
:
new_marker
.
comment
=
new_marker
.
comment
+
"
, new cleavage site
"
set_of_new_markers
.
add
(
new_marker
)
return
set_of_new_markers
def
find_markers_all_sequences
(
set_of_sequences
,
set_of_markers
):
# set_of_sequences: set of target fasta sequences[object defined in sequences.py]
# construction of dict_of_model_markers and dict_of_taxid
#dict_of_model_markers, key: peptide sequence [string], value:set of 3-uplets (PTM, code, gene_name)
#dict_of_taxid, key: (sequence,PTM), value: set of taxid
dict_of_model_markers
=
{}
dict_of_taxid
=
{}
marker_count
=
0
for
m
in
set_of_markers
:
if
m
.
sequence
==
None
or
len
(
m
.
sequence
)
==
0
:
continue
else
:
sequence
=
m
.
sequence
.
strip
()
if
len
(
m
.
code
)
==
0
or
m
.
code
==
None
:
marker_count
+=
1
peptide_name
=
"
M
"
+
str
(
marker_count
)
else
:
peptide_name
=
m
.
code
if
m
.
taxid
==
None
or
len
(
m
.
taxid
)
==
0
:
taxid
=
set
()
else
:
taxid
=
{
m
.
taxid
.
strip
()}
model_marker
=
(
ut
.
reduced
(
m
.
ptm
),
ut
.
reduced
(
peptide_name
),
ut
.
reduced_upper
(
m
.
protein
))
ut
.
update_dictoset
(
dict_of_model_markers
,
sequence
,
{
model_marker
})
#ut.update_dictoset(dict_of_taxid, (sequence,m.ptm.strip()), taxid)
set_of_new_markers
=
set
()
for
seq
in
set_of_sequences
:
set_of_digested_peptides
=
sequences
.
in_silico_digestion
({
seq
},
1
,
12
,
33
,
False
)
s
=
find_markers_single_sequence
(
seq
,
set_of_digested_peptides
,
dict_of_model_markers
,
set_of_markers
)
set_of_new_markers
.
update
(
s
)
list_of_new_markers
=
markers
.
sort_and_merge
(
set_of_new_markers
)
return
list_of_new_markers
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment