Source code for sglearn.featurization

import pandas as pd
from Bio.SeqUtils import MeltingTemp
from seqfold import dg
import math
from joblib import Parallel, delayed
from tqdm import tqdm

[docs]def get_frac_g_or_c(feature_dict, guide_sequence): """Get gc content Parameters ---------- feature_dict: dict Feature dictionary guide_sequence: str Guide sequence """ g_count = guide_sequence.count('G') c_count = guide_sequence.count('C') gc_frac = (g_count + c_count)/len(guide_sequence) feature_dict['GC content'] = gc_frac
[docs]def get_one_nt_frac(feature_dict, guide, nts): """Get fraction of single nt Parameters ---------- feature_dict: dict Feature dictionary guide: str Guide sequence nts: list List of nucleotides """ for nt in nts: nt_frac = guide.count(nt)/len(guide) feature_dict[nt] = nt_frac
[docs]def get_two_nt_frac(feature_dict, guide, nts): """Get fraction of two nts Parameters ---------- feature_dict: dict Feature dictionary guide: str Guide sequence nts: list List of nucleotides """ for nt1 in nts: for nt2 in nts: two_mer = nt1 + nt2 nts_counts = guide.count(two_mer) nts_frac = nts_counts/(len(guide) - 1) feature_dict[nt1 + nt2] = nts_frac
[docs]def get_three_nt_counts(feature_dict, guide, nts): """Get fraction of three nts Parameters ---------- feature_dict: dict Feature dictionary guide: str Guide sequence nts: list List of nucleotides """ for nt1 in nts: for nt2 in nts: for nt3 in nts: k_mer = nt1 + nt2 + nt3 nts_counts = guide.count(k_mer) nts_frac = nts_counts/(len(guide) - 2) feature_dict[nt1 + nt2 + nt3] = nts_frac
[docs]def get_one_nt_pos(feature_dict, context_sequence, nts, context_order): """One hot encode single nucleotide Parameters ---------- feature_dict: dict Feature dictionary context_sequence: str Context sequence nts: list List of nucleotides context_order: list Position of context """ for i in range(len(context_order)): curr_nt = context_sequence[i] for nt in nts: key = context_order[i] + nt if curr_nt == nt: feature_dict[key] = 1 else: feature_dict[key] = 0
[docs]def get_two_nt_pos(feature_dict, context_sequence, nts, context_order): """One hot encode two nucleotides Parameters ---------- feature_dict: dict Feature dictionary context_sequence: str Context sequence nts: list List of nucleotides context_order: list Position of context """ for i in range(len(context_order) - 1): curr_nts = context_sequence[i:i+2] for nt1 in nts: for nt2 in nts: match_nts = nt1+nt2 key = context_order[i] + match_nts if curr_nts == match_nts: feature_dict[key] = 1 else: feature_dict[key] = 0
[docs]def get_three_nt_pos(feature_dict, context_sequence, nts, context_order): """One hot encode three nucleotides Parameters ---------- feature_dict: dict Feature dictionary context_sequence: str Context sequence nts: list List of nucleotides context_order: list Position of context """ for i in range(len(context_order) - 2): curr_nts = context_sequence[i:i+3] for nt1 in nts: for nt2 in nts: for nt3 in nts: match_nts = nt1+nt2+nt3 key = context_order[i] + match_nts if curr_nts == match_nts: feature_dict[key] = 1 else: feature_dict[key] = 0
[docs]def get_thermo(feature_dict, guide_sequence, sections): """Use Biopython to get thermo info. from context and guides Parameters ---------- feature_dict: dict Feature dictionary guide_sequence: str Guide sequence sections: iterable of int Section of guide sequence """ feature_dict['Tm DD guide'] = MeltingTemp.Tm_NN(guide_sequence) feature_dict['Tm RD guide'] = MeltingTemp.Tm_NN(guide_sequence, nn_table=MeltingTemp.R_DNA_NN1) dg_rr = dg(guide_sequence.replace('T', 'U')) if dg_rr == -math.inf: feature_dict['DG RR guide'] = -20 elif dg_rr == math.inf: feature_dict['DG RR guide'] = 8 elif dg_rr == 1600: feature_dict['DG RR guide'] = 7 else: feature_dict['DG RR guide'] = dg_rr section_start = 0 for s in sections: section_end = s feature_name = 'Tm RD ' + str(int(section_start + 1)) + ' to ' + str(int(section_end)) feature_dict[feature_name] = MeltingTemp.Tm_NN(guide_sequence[section_start:section_end], nn_table=MeltingTemp.R_DNA_NN1) section_start = section_end
[docs]def get_pam_interaction(feature_dict, context_sequence, nts, context_order, pam_ends): """One hot encode interactions on either side of the PAM sequence Parameters ---------- feature_dict: dict Feature dictionary context_sequence: str Context sequence nts: list List of nucleotides context_order: list Position of context pam_ends: tuple Location on either side of the pam, zero-indexed """ l_pam_index = pam_ends[0] l_pam_context = context_order[l_pam_index] l_pam_nt = context_sequence[l_pam_index] r_pam_index = pam_ends[1] r_pam_context = context_order[r_pam_index] r_pam_nt = context_sequence[r_pam_index] for nt1 in nts: for nt2 in nts: key = l_pam_context + nt1 + '_' + r_pam_context + nt2 if (l_pam_nt == nt1) & (r_pam_nt == nt2): feature_dict[key] = 1 else: feature_dict[key] = 0
[docs]def get_polyn(feature_dict, guide_sequence, nts): """Get max run for each nucleotide Parameters ---------- feature_dict: dict Feature dictionary guide_sequence: str Guide sequence nts: list List of nucleotides """ for nt in nts: lng = 0 cnt = 0 for c in guide_sequence: if c == nt: cnt += 1 else: lng = max(lng, cnt) cnt = 0 lng = max(lng, cnt) feature_dict['Poly' + nt] = lng
[docs]def get_context_order(k, pam_start, pam_length, guide_start, guide_length): """Get named order of context sequence Parameters ---------- k: int length of kmer pam_start: int Start of PAM, one-indexed pam_length: int PAM length guide_start: int Start of guide, one-indexed guide_length: int Length of guide Returns ------- list Ordering for context sequence """ pam_order = ['P' + str(x) for x in range(1, pam_length + 1)] guide_order = [str(x) for x in range(1, guide_length + 1)] if pam_start == min(pam_start, guide_start): second_component = pam_order third_component = guide_order else: second_component = guide_order third_component = pam_order first_ord = ['-' + str(x) for x in reversed(range(1, min(pam_start, guide_start) + 1))] fourth_ord = ['+' + str(x) for x in range(1, k - (len(first_ord) + len(second_component) + len(third_component)) + 1)] context_order = first_ord + second_component + third_component + fourth_ord return context_order
[docs]def get_guide_sequence(context, guide_start, guide_length): return context[guide_start:(guide_start + guide_length)]
[docs]def build_feature_dict(context, guide_start, guide_length, features, nts, context_order, pam_interaction, guide_sections): curr_dict = {} guide_sequence = get_guide_sequence(context, guide_start, guide_length) if 'GC content' in features: get_frac_g_or_c(curr_dict, guide_sequence) if 'Pos. Ind. 1mer' in features: get_one_nt_frac(curr_dict, guide_sequence, nts) if 'Pos. Ind. 2mer' in features: get_two_nt_frac(curr_dict, guide_sequence, nts) if 'Pos. Ind. 3mer' in features: get_three_nt_counts(curr_dict, guide_sequence, nts) if 'Pos. Dep. 1mer' in features: get_one_nt_pos(curr_dict, context, nts, context_order) if 'Pos. Dep. 2mer' in features: get_two_nt_pos(curr_dict, context, nts, context_order) if 'Pos. Dep. 3mer' in features: get_three_nt_pos(curr_dict, context, nts, context_order) if 'PAM interaction' in features: get_pam_interaction(curr_dict, context, nts, context_order, pam_interaction) if 'Tm' in features: get_thermo(curr_dict, guide_sequence, guide_sections) if 'PolyN' in features: get_polyn(curr_dict, guide_sequence, nts) return curr_dict
[docs]def featurize_guides(kmers, features=None, pam_start=24, pam_length=3, guide_start=4, guide_length=20, pam_interaction=(24, 27), guide_sections=(10, 20), n_jobs=1): """Featurize a list of guide sequences Parameters ---------- kmers: list of str Context sequences features: list of str, optional List of features. Will default to rule set 2 features guide_start: int Position of guide start, zero-indexed guide_length: int Length of guide pam_start: int Position of pam start, zero-indexed pam_length: int Length of PAM pam_interaction: tuple Location on either side of the pam, zero-indexed Returns ------- DataFrame Nucleotide features """ if features is None: features = ['Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'PAM interaction', 'GC content', 'Tm', 'PolyN'] possible_feats = {'Pos. Ind. 1mer', 'Pos. Ind. 2mer', 'Pos. Ind. 3mer', 'Pos. Dep. 1mer', 'Pos. Dep. 2mer', 'Pos. Dep. 3mer', 'GC content', 'Tm', 'PAM interaction', 'PolyN'} if not set(features).issubset(possible_feats): diff = set(features) - possible_feats assert ValueError(str(diff) + ' are not currently supported as features') k = len(kmers[0]) context_order = get_context_order(k, pam_start, pam_length, guide_start, guide_length) nts = ['A', 'C', 'T', 'G'] feature_dict_list = Parallel(n_jobs=n_jobs)(delayed(build_feature_dict) (context, guide_start, guide_length, features, nts, context_order, pam_interaction, guide_sections) for context in tqdm(kmers)) feature_matrix = pd.DataFrame(feature_dict_list) feature_matrix.index = kmers return feature_matrix