import pandas as pd
from Bio.SeqUtils import MeltingTemp
from seqfold import dg
import math
from joblib import Parallel, delayed
from tqdm import tqdm
[docs]def get_frac_g_or_c(feature_dict, guide_sequence):
"""Get gc content
Parameters
----------
feature_dict: dict
Feature dictionary
guide_sequence: str
Guide sequence
"""
g_count = guide_sequence.count('G')
c_count = guide_sequence.count('C')
gc_frac = (g_count + c_count)/len(guide_sequence)
feature_dict['GC content'] = gc_frac
[docs]def get_one_nt_frac(feature_dict, guide, nts):
"""Get fraction of single nt
Parameters
----------
feature_dict: dict
Feature dictionary
guide: str
Guide sequence
nts: list
List of nucleotides
"""
for nt in nts:
nt_frac = guide.count(nt)/len(guide)
feature_dict[nt] = nt_frac
[docs]def get_two_nt_frac(feature_dict, guide, nts):
"""Get fraction of two nts
Parameters
----------
feature_dict: dict
Feature dictionary
guide: str
Guide sequence
nts: list
List of nucleotides
"""
for nt1 in nts:
for nt2 in nts:
two_mer = nt1 + nt2
nts_counts = guide.count(two_mer)
nts_frac = nts_counts/(len(guide) - 1)
feature_dict[nt1 + nt2] = nts_frac
[docs]def get_three_nt_counts(feature_dict, guide, nts):
"""Get fraction of three nts
Parameters
----------
feature_dict: dict
Feature dictionary
guide: str
Guide sequence
nts: list
List of nucleotides
"""
for nt1 in nts:
for nt2 in nts:
for nt3 in nts:
k_mer = nt1 + nt2 + nt3
nts_counts = guide.count(k_mer)
nts_frac = nts_counts/(len(guide) - 2)
feature_dict[nt1 + nt2 + nt3] = nts_frac
[docs]def get_one_nt_pos(feature_dict, context_sequence, nts, context_order):
"""One hot encode single nucleotide
Parameters
----------
feature_dict: dict
Feature dictionary
context_sequence: str
Context sequence
nts: list
List of nucleotides
context_order: list
Position of context
"""
for i in range(len(context_order)):
curr_nt = context_sequence[i]
for nt in nts:
key = context_order[i] + nt
if curr_nt == nt:
feature_dict[key] = 1
else:
feature_dict[key] = 0
[docs]def get_two_nt_pos(feature_dict, context_sequence, nts, context_order):
"""One hot encode two nucleotides
Parameters
----------
feature_dict: dict
Feature dictionary
context_sequence: str
Context sequence
nts: list
List of nucleotides
context_order: list
Position of context
"""
for i in range(len(context_order) - 1):
curr_nts = context_sequence[i:i+2]
for nt1 in nts:
for nt2 in nts:
match_nts = nt1+nt2
key = context_order[i] + match_nts
if curr_nts == match_nts:
feature_dict[key] = 1
else:
feature_dict[key] = 0
[docs]def get_three_nt_pos(feature_dict, context_sequence, nts, context_order):
"""One hot encode three nucleotides
Parameters
----------
feature_dict: dict
Feature dictionary
context_sequence: str
Context sequence
nts: list
List of nucleotides
context_order: list
Position of context
"""
for i in range(len(context_order) - 2):
curr_nts = context_sequence[i:i+3]
for nt1 in nts:
for nt2 in nts:
for nt3 in nts:
match_nts = nt1+nt2+nt3
key = context_order[i] + match_nts
if curr_nts == match_nts:
feature_dict[key] = 1
else:
feature_dict[key] = 0
[docs]def get_thermo(feature_dict, guide_sequence, sections):
"""Use Biopython to get thermo info. from context and guides
Parameters
----------
feature_dict: dict
Feature dictionary
guide_sequence: str
Guide sequence
sections: iterable of int
Section of guide sequence
"""
feature_dict['Tm DD guide'] = MeltingTemp.Tm_NN(guide_sequence)
feature_dict['Tm RD guide'] = MeltingTemp.Tm_NN(guide_sequence, nn_table=MeltingTemp.R_DNA_NN1)
dg_rr = dg(guide_sequence.replace('T', 'U'))
if dg_rr == -math.inf:
feature_dict['DG RR guide'] = -20
elif dg_rr == math.inf:
feature_dict['DG RR guide'] = 8
elif dg_rr == 1600:
feature_dict['DG RR guide'] = 7
else:
feature_dict['DG RR guide'] = dg_rr
section_start = 0
for s in sections:
section_end = s
feature_name = 'Tm RD ' + str(int(section_start + 1)) + ' to ' + str(int(section_end))
feature_dict[feature_name] = MeltingTemp.Tm_NN(guide_sequence[section_start:section_end],
nn_table=MeltingTemp.R_DNA_NN1)
section_start = section_end
[docs]def get_pam_interaction(feature_dict, context_sequence, nts, context_order, pam_ends):
"""One hot encode interactions on either side of the PAM sequence
Parameters
----------
feature_dict: dict
Feature dictionary
context_sequence: str
Context sequence
nts: list
List of nucleotides
context_order: list
Position of context
pam_ends: tuple
Location on either side of the pam, zero-indexed
"""
l_pam_index = pam_ends[0]
l_pam_context = context_order[l_pam_index]
l_pam_nt = context_sequence[l_pam_index]
r_pam_index = pam_ends[1]
r_pam_context = context_order[r_pam_index]
r_pam_nt = context_sequence[r_pam_index]
for nt1 in nts:
for nt2 in nts:
key = l_pam_context + nt1 + '_' + r_pam_context + nt2
if (l_pam_nt == nt1) & (r_pam_nt == nt2):
feature_dict[key] = 1
else:
feature_dict[key] = 0
[docs]def get_polyn(feature_dict, guide_sequence, nts):
"""Get max run for each nucleotide
Parameters
----------
feature_dict: dict
Feature dictionary
guide_sequence: str
Guide sequence
nts: list
List of nucleotides
"""
for nt in nts:
lng = 0
cnt = 0
for c in guide_sequence:
if c == nt:
cnt += 1
else:
lng = max(lng, cnt)
cnt = 0
lng = max(lng, cnt)
feature_dict['Poly' + nt] = lng
[docs]def get_context_order(k, pam_start, pam_length, guide_start, guide_length):
"""Get named order of context sequence
Parameters
----------
k: int
length of kmer
pam_start: int
Start of PAM, one-indexed
pam_length: int
PAM length
guide_start: int
Start of guide, one-indexed
guide_length: int
Length of guide
Returns
-------
list
Ordering for context sequence
"""
pam_order = ['P' + str(x) for x in range(1, pam_length + 1)]
guide_order = [str(x) for x in range(1, guide_length + 1)]
if pam_start == min(pam_start, guide_start):
second_component = pam_order
third_component = guide_order
else:
second_component = guide_order
third_component = pam_order
first_ord = ['-' + str(x) for x in reversed(range(1, min(pam_start, guide_start) + 1))]
fourth_ord = ['+' + str(x) for x in range(1, k - (len(first_ord) + len(second_component) +
len(third_component)) + 1)]
context_order = first_ord + second_component + third_component + fourth_ord
return context_order
[docs]def get_guide_sequence(context, guide_start, guide_length):
return context[guide_start:(guide_start + guide_length)]
[docs]def build_feature_dict(context, guide_start, guide_length, features, nts, context_order,
pam_interaction, guide_sections):
curr_dict = {}
guide_sequence = get_guide_sequence(context, guide_start, guide_length)
if 'GC content' in features:
get_frac_g_or_c(curr_dict, guide_sequence)
if 'Pos. Ind. 1mer' in features:
get_one_nt_frac(curr_dict, guide_sequence, nts)
if 'Pos. Ind. 2mer' in features:
get_two_nt_frac(curr_dict, guide_sequence, nts)
if 'Pos. Ind. 3mer' in features:
get_three_nt_counts(curr_dict, guide_sequence, nts)
if 'Pos. Dep. 1mer' in features:
get_one_nt_pos(curr_dict, context, nts, context_order)
if 'Pos. Dep. 2mer' in features:
get_two_nt_pos(curr_dict, context, nts, context_order)
if 'Pos. Dep. 3mer' in features:
get_three_nt_pos(curr_dict, context, nts, context_order)
if 'PAM interaction' in features:
get_pam_interaction(curr_dict, context, nts, context_order, pam_interaction)
if 'Tm' in features:
get_thermo(curr_dict, guide_sequence, guide_sections)
if 'PolyN' in features:
get_polyn(curr_dict, guide_sequence, nts)
return curr_dict
[docs]def featurize_guides(kmers, features=None,
pam_start=24, pam_length=3,
guide_start=4, guide_length=20,
pam_interaction=(24, 27),
guide_sections=(10, 20),
n_jobs=1):
"""Featurize a list of guide sequences
Parameters
----------
kmers: list of str
Context sequences
features: list of str, optional
List of features. Will default to rule set 2 features
guide_start: int
Position of guide start, zero-indexed
guide_length: int
Length of guide
pam_start: int
Position of pam start, zero-indexed
pam_length: int
Length of PAM
pam_interaction: tuple
Location on either side of the pam, zero-indexed
Returns
-------
DataFrame
Nucleotide features
"""
if features is None:
features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer',
'Pos. Dep. 1mer', 'Pos. Dep. 2mer',
'PAM interaction', 'GC content', 'Tm',
'PolyN']
possible_feats = {'Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer',
'Pos. Dep. 1mer', 'Pos. Dep. 2mer',
'Pos. Dep. 3mer', 'GC content', 'Tm',
'PAM interaction', 'PolyN'}
if not set(features).issubset(possible_feats):
diff = set(features) - possible_feats
assert ValueError(str(diff) + ' are not currently supported as features')
k = len(kmers[0])
context_order = get_context_order(k, pam_start, pam_length, guide_start, guide_length)
nts = ['A', 'C', 'T', 'G']
feature_dict_list = Parallel(n_jobs=n_jobs)(delayed(build_feature_dict)
(context, guide_start, guide_length, features, nts, context_order,
pam_interaction, guide_sections) for context in tqdm(kmers))
feature_matrix = pd.DataFrame(feature_dict_list)
feature_matrix.index = kmers
return feature_matrix