
Peptide Embedding for Unsupervised Clustering


This post looks at how the peptides were encoded in an interesting paper titled "Unsupervised machine learning leads to an abiotic picomolar peptide ligand."

The different embeddings that were used:

  1. one-hot encoding
  2. extended connectivity fingerprints (ECFP_6) from RDKit, which represented each amino acid as a vector where each index indicates the presence (1) or absence (0) of a specific molecular substructure
  3. physicochemical encoding based on 12 physicochemical properties.
  4. N-grams encoding represented the entire peptide by its ungapped motifs, irrespective of position.
  5. latent embeddings of the entire peptide from the protein language model ESM-2. This is not included in this notebook, but a minimal sketch is shown after this list.
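
For item 5, here is a minimal sketch of how per-peptide ESM-2 embeddings could be obtained with the fair-esm package. This is my own illustration, not code from the paper; the checkpoint (esm2_t12_35M_UR50D), the mean-pooling over residues, and the example peptides are all assumptions.

import torch
import esm  # pip install fair-esm (assumed dependency; not used elsewhere in this notebook)

# Load a small ESM-2 checkpoint; the paper may have used a different model size
model, alphabet = esm.pretrained.esm2_t12_35M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()

# Two example X12 variable regions from the dataset
batch_data = [('pep1', 'VWRDAEDYATFP'), ('pep2', 'LLKYTDRHDYAW')]
labels, strs, tokens = batch_converter(batch_data)
with torch.no_grad():
    results = model(tokens, repr_layers=[12])
reps = results['representations'][12]
# Mean-pool over residue positions (skipping the BOS/EOS tokens) to get one vector per peptide
esm_embeddings = [reps[i, 1:len(s) + 1].mean(0).numpy() for i, s in enumerate(strs)]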

Read data for target-specific binders, nonspecific, and nonbinding peptides from affinity-selection mass spectrometry (AS-MS).

import pandas as pd
# Inputs
file = './All Data Randomized C-L.csv'
df = pd.read_csv(file)
df

Peptide Label ALC
0 VWRDAEDYATFPK Specific NaN
1 TPDWDGPDYAAHK Specific NaN
2 EMDAPDYASWEPK Specific NaN
3 SVEDDAPDYADFK Specific NaN
4 MMDMDLQDYAGLK Specific NaN
... ... ... ...
4099 LLKYTDRHDYAWK Specific NaN
4100 DTDWPDYSSFLFK Specific NaN
4101 DSWEDYSSNTAAK Unknown NaN
4102 PQDLSEYAHSNNK Nonspecific NaN
4103 FLYDVHDYAFEHK Specific NaN

4104 rows × 3 columns

 # Isolates the X12 variable region from all peptides. This is specific to AS-MS
seq_list = [seq[0:12] for seq in df['Peptide']]
seq_list[-5:]
['LLKYTDRHDYAW',
 'DTDWPDYSSFLF',
 'DSWEDYSSNTAA',
 'PQDLSEYAHSNN',
 'FLYDVHDYAFEH']
import math

print(f'{len(seq_list)} peptides', flush=True) 
print(f'If using UMAP, n_neighbors is recommended to be {math.floor(len(seq_list)*0.05)} with n_bors = {math.ceil(math.log2(len(seq_list)*0.05))}')
4104 peptides
If using UMAP, n_neighbors is recommended to be 205 with n_bors = 8

One-hot encoding

# Amino acid SMILES dictionary (capped residues); its keys define the one-hot alphabet, and the SMILES are used later for fingerprint encoding with RDKit
Std_All_AA_Smiles = {
'A' : 'CNC(=O)[C@H](C)NC(C)=O',
'C' : 'CNC(=O)[C@H](CS)NC(C)=O', # note that Cys is not used in AS-MS libraries for this work, but is included for other use cases
'D' : 'CNC(=O)[C@H](CC(=O)O)NC(C)=O',
'E' : 'CNC(=O)[C@H](CCC(=O)O)NC(C)=O',
'F' : 'CNC(=O)[C@H](Cc1ccccc1)NC(C)=O',
'G' : 'CNC(=O)CNC(C)=O',
'H' : 'CNC(=O)[C@H](Cc1c[nH]cn1)NC(C)=O',
'I' : 'CC[C@H](C)[C@H](NC(C)=O)C(=O)NC', # note that Ile is not used in AS-MS libraries for this work, but is included for other use cases
'K' : 'CNC(=O)[C@H](CCCCN)NC(C)=O',
'L' : 'CNC(=O)[C@H](CC(C)C)NC(C)=O',
'M' : 'CNC(=O)[C@H](CCSC)NC(C)=O',
'N' : 'CNC(=O)[C@H](CC(N)=O)NC(C)=O',
'P' : 'CNC(=O)[C@@H]1CCCN1C(C)=O',
'Q' : 'CNC(=O)[C@H](CCC(N)=O)NC(C)=O',
'R' : 'CNC(=O)[C@H](CCCNC(=N)N)NC(C)=O',
'S' : 'CNC(=O)[C@H](CO)NC(C)=O',
'T' : 'CNC(=O)[C@@H](NC(C)=O)[C@@H](C)O',
'V' : 'CNC(=O)[C@@H](NC(C)=O)C(C)C',
'W' : 'CNC(=O)[C@H](Cc1c[nH]c2ccccc12)NC(C)=O',
'Y' : 'CNC(=O)[C@H](Cc1ccc(O)cc1)NC(C)=O',
}
from sklearn import preprocessing
import numpy as np
# OneHot encoding function
enc_one = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc_one.fit(np.array(list(Std_All_AA_Smiles.keys())).reshape(-1, 1))
def one_hot(sequence):
    return enc_one.transform(np.array(list(sequence)).reshape(-1,1)).toarray().reshape(-1)
seq_list[0]
'VWRDAEDYATFP'
one_hot(seq_list[0])
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0.])
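
Each peptide here is 12 residues and the alphabet defined above has 20 amino acids, so the flattened one-hot vector has length 240. A quick check:

one_hot(seq_list[0]).shape  # (240,) = 12 residues x 20 amino acids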
# Do it for all sequences
seq_rep = {}
seq_rep['OneHot'] = [one_hot(seq) for seq in seq_list]
# List to keep track of the representations used
seq_rep_list = ['OneHot', 'PhysProperties', 'Fingerprint', 'NGrams']

Fingerprint Encoding

from rdkit.Chem import AllChem as Chem
class Fingerprint_Generation:
    def __init__(self, smiles, radius, nbits):
        # Pre-compute a Morgan fingerprint for every amino acid SMILES
        self.lookupfps = {}
        for key, value in Std_All_AA_Smiles.items():
            mol = Chem.MolFromSmiles(value)
            fp = np.array(Chem.GetMorganFingerprintAsBitVect(mol, radius, nbits))
            self.lookupfps[key] = fp
        self.lookupfps[' '] = np.zeros(self.lookupfps['A'].shape)  # blank/padding position maps to an all-zero fingerprint
        self.lookupfps['x'] = np.ones(self.lookupfps['A'].shape)   # 'x' placeholder maps to an all-ones fingerprint
    def seq(self, seq):
        # Stack the per-residue fingerprints for a whole sequence
        fp = np.asarray([self.lookupfps[seq[i]] for i in range(len(seq))])
        return fp

def fingerprint(sequence,NBITS,fp):
    fp_seq = fp.seq(sequence)
    return fp_seq.reshape(-1)
# Parameters
NBITS = 256 # For Fingerprint encoding, 256 was optimal 
RADIUS = 3  # For Fingerprint encoding, ECFP_6

fp = Fingerprint_Generation(Std_All_AA_Smiles.keys(),RADIUS,NBITS)
seq_rep['Fingerprint'] = [fingerprint(seq,NBITS,fp) for seq in seq_list]
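
Recent RDKit releases deprecate GetMorganFingerprintAsBitVect in favor of the fingerprint generator interface. A rough equivalent of the per-residue lookup above, assuming RDKit 2023.09 or later, would be:

from rdkit.Chem import rdFingerprintGenerator
# Morgan generator with radius 3 (ECFP_6) and 256 bits, matching RADIUS and NBITS above
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=RADIUS, fpSize=NBITS)
fp_array = morgan_gen.GetFingerprintAsNumPy(Chem.MolFromSmiles(Std_All_AA_Smiles['A']))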

Using Physical Properties for Encoding

Each peptide was encoded based on 12 physicochemical properties of its composite amino acids: hydrophilicity, flexibility, accessibility, turns scale, exposed surface, polarity, antigenic propensity, hydrophobicity, net charge index of the side chains, polarizability, solvent-accessible surface area, and side-chain volume.

For details, see: Residue–Residue Interaction Prediction via Stacked Meta-Learning, https://doi.org/10.3390/ijms22126393

# Extended physical properties database and encoding
extended_prop_aa = {
'A':[0.620140363440966,0.17686056841772,-0.188749006724763,-0.224734173518473,-0.0836274309924442,-0.553429077935438,-1.32955152740811,-1.38172110737113,-1.02627398746377,-0.509282137090254,-0.585795803359558,-0.781110425203712,-0.520029443227509,0.213921592667873],
'C':[0.290065653867549,0.066322713156645,-0.440414349024447,-0.226412219190666,-1.04998885579402,-0.476729477285148,-0.489295456136705,-0.77494270388318,-0.578373331393242,-0.427542694668279,-1.63430191054009,-1.21748496442366,0.881041618691234,2.14004615278222],
'D':[-0.900203753382047,1.42435922064985,1.57290838937303,-0.225916432969336,1.73759217728745,1.55580993994754,-0.724977037103076,-0.501892422313604,-0.698861227178295,0.712268419104827,0.73623363612894,0.746200462066116,1.58587491364422,-0.881976863948911],
'E':[-0.740167530558572,1.07695453268647,1.57290838937303,-0.224734173518473,1.47741794753318,2.03310948183212,-0.253613875170334,0.0940506525406323,-0.122614769075867,0.794007861526803,1.0097569684369,0.964387731676091,-0.623175779074287,-0.964999474298668],
'F':[1.1902694072496,-1.60753623793963,-1.1954103759235,-0.22355191406761,-1.16149209711728,-0.535290658862734,1.1707226358873,0.887196708528451,1.27871184494595,-0.90435610879647,-0.904906357718851,-0.999297694813687,0.322332299521183,0.363362291297433],
'G':[0.480108668470425,0.745340966903249,0.062916335574921,-0.218174540436266,0.250882292977333,-0.553429077935438,-1.80091468934086,-2.03184082539393,-1.7465820600918,0.912075945025212,-0.631383025410886,-0.999297694813687,-0.30514124354672,-0.837698138429041],
'H':[-0.400090557058688,0.17686056841772,-0.188749006724763,-0.225420646748007,0.771230752485876,-0.370490165573597,0.555901120322853,0.447282365999688,0.322666584912373,-0.304933531035315,1.0097569684369,1.00802518559809,0.262163603610562,0.440850060957206],
'I':[1.38031242185247,-1.41804277177779,-0.843078896703942,-0.224162112493862,-1.16149209711728,-0.545655469761422,0.105032008908926,-0.0186367652499869,0.702465386843518,-1.24493711888804,-1.26960413412947,-0.868385333047702,-1.14750298629541,0.700987573386442],
'K':[-1.50033958897008,0.745340966903249,1.57290838937303,-0.224314662100425,1.10574047645565,2.01186161948981,0.443183842469371,0.952208680330731,0.872720022191963,1.37526611874974,1.60239085510416,2.27351134933594,-0.743513170895529,-0.527747059789951],
'L':[1.06023997620552,-1.60753623793963,-0.843078896703942,-0.22301799044464,-1.27299533844054,-0.53010825341339,0.105032008908926,0.243578187685877,0.702465386843518,-0.972472310814783,-0.996080801821506,-0.737472971281717,-0.700535530959371,1.24340196100485],
'M':[0.6401448912939,-0.817980128931955,-0.591413554404258,-0.224886723125036,-0.975653361578516,-0.47932068000982,0.463677892988186,0.466785957540372,0.718181199337221,-0.768123704759844,-0.631383025410886,-0.562923155593736,-1.10452534635925,-1.10337049154826],
'N':[-0.780176586264441,0.950625555245245,1.06957770477366,-0.224810448321755,1.21724371777891,-0.378263773747613,-0.42781330458026,-0.354531952895101,-0.209051737791231,0.871206223814225,0.872995302282922,0.702563008144121,2.74627119192048,-1.38011252604745],
'P':[0.120027167117606,0.17686056841772,0.062916335574921,-0.215848158936181,-0.120795178100197,-0.553429077935438,-0.458554380358482,-0.759773243795982,-0.649094487614904,0.303571206994949,-0.585795803359558,-0.781110425203712,-0.520029443227509,0.213921592667873],
'Q':[-0.850192433749711,0.792714333443709,0.163582472494795,-0.223132402649562,0.808398499593628,-0.370490165573597,0.0435498573524813,0.245745253412619,0.367194720311197,0.603282495875526,1.0097569684369,1.00802518559809,0.262163603610562,-0.0572856011413323],
'R':[-2.5305727733962,0.508474134200945,1.57290838937303,-0.223323089657766,0.808398499593628,2.1414217557234,1.18096966114671,1.60666252980702,1.00368512630615,1.33439639753875,1.51121641100151,1.48803717874003,-0.820872922780613,-0.843232979119025],
'S':[-0.180040750676409,0.871669944344477,0.213915540954732,-0.224810448321755,0.325217787192839,-0.466882906931394,-1.16559912325759,-1.12817441734224,-0.979126549982658,1.42975908036439,0.143599749461682,-0.0392737085297955,1.31941354604004,-0.073890123211283],
'T':[-0.050011319632336,0.666385356002481,-0.138415938264826,-0.224886723125036,0.102211304546321,-0.467401147476329,-0.694235961324853,-0.636250497371649,-0.40288009188023,0.603282495875526,0.371535859718319,-0.0392737085297955,0.399692051406267,-0.64397871427961],
'V':[1.08024450405846,-0.739024518031187,-0.692079691324131,-0.222827303436436,-0.90131786736301,-0.546691950851291,-0.366331153023816,-0.376202610162528,0.12621892874109,-0.727253983548856,-1.17842969002682,-0.824747879125707,-0.545816027189203,1.97953577277269],
'W':[0.810183378043843,-1.73386521538086,-1.64840799206293,-0.22355191406761,-1.08715660290178,-0.444598563499216,2.39011864175678,1.82987029966151,2.06450246963108,-1.64909325086336,-0.494621359256903,-0.693835517359722,-0.666153419010445,-0.73253616531935],
'Y':[0.260058862088147,-0.454784318788423,-1.09474423900363,4.2485160771289,-0.789814626039751,-0.469992350201001,1.25269883796256,1.19058591027243,-1.74595342759205,-1.43112140440476,0.645059192026285,0.35346337676816,-0.0816575158786997,0.750801139596297],
}
def ext_prop_enc(sequence):
    # Look up the property vector for each residue and flatten into one long vector
    return np.array([extended_prop_aa[letter] for letter in sequence]).reshape(-1)
seq_rep['PhysProperties'] = [ext_prop_enc(seq) for seq in seq_list]
seq_rep['PhysProperties'][0]
array([ 1.0802445 , -0.73902452, -0.69207969, -0.2228273 , -0.90131787,
       -0.54669195, -0.36633115, -0.37620261,  0.12621893, -0.72725398,
       -1.17842969, -0.82474788, -0.54581603,  1.97953577,  0.81018338,
       -1.73386522, -1.64840799, -0.22355191, -1.0871566 , -0.44459856,
        2.39011864,  1.8298703 ,  2.06450247, -1.64909325, -0.49462136,
       -0.69383552, -0.66615342, -0.73253617, -2.53057277,  0.50847413,
        1.57290839, -0.22332309,  0.8083985 ,  2.14142176,  1.18096966,
        1.60666253,  1.00368513,  1.3343964 ,  1.51121641,  1.48803718,
       -0.82087292, -0.84323298, -0.90020375,  1.42435922,  1.57290839,
       -0.22591643,  1.73759218,  1.55580994, -0.72497704, -0.50189242,
       -0.69886123,  0.71226842,  0.73623364,  0.74620046,  1.58587491,
       -0.88197686,  0.62014036,  0.17686057, -0.18874901, -0.22473417,
       -0.08362743, -0.55342908, -1.32955153, -1.38172111, -1.02627399,
       -0.50928214, -0.5857958 , -0.78111043, -0.52002944,  0.21392159,
       -0.74016753,  1.07695453,  1.57290839, -0.22473417,  1.47741795,
        2.03310948, -0.25361388,  0.09405065, -0.12261477,  0.79400786,
        1.00975697,  0.96438773, -0.62317578, -0.96499947, -0.90020375,
        1.42435922,  1.57290839, -0.22591643,  1.73759218,  1.55580994,
       -0.72497704, -0.50189242, -0.69886123,  0.71226842,  0.73623364,
        0.74620046,  1.58587491, -0.88197686,  0.26005886, -0.45478432,
       -1.09474424,  4.24851608, -0.78981463, -0.46999235,  1.25269884,
        1.19058591, -1.74595343, -1.4311214 ,  0.64505919,  0.35346338,
       -0.08165752,  0.75080114,  0.62014036,  0.17686057, -0.18874901,
       -0.22473417, -0.08362743, -0.55342908, -1.32955153, -1.38172111,
       -1.02627399, -0.50928214, -0.5857958 , -0.78111043, -0.52002944,
        0.21392159, -0.05001132,  0.66638536, -0.13841594, -0.22488672,
        0.1022113 , -0.46740115, -0.69423596, -0.6362505 , -0.40288009,
        0.6032825 ,  0.37153586, -0.03927371,  0.39969205, -0.64397871,
        1.19026941, -1.60753624, -1.19541038, -0.22355191, -1.1614921 ,
       -0.53529066,  1.17072264,  0.88719671,  1.27871184, -0.90435611,
       -0.90490636, -0.99929769,  0.3223323 ,  0.36336229,  0.12002717,
        0.17686057,  0.06291634, -0.21584816, -0.12079518, -0.55342908,
       -0.45855438, -0.75977324, -0.64909449,  0.30357121, -0.5857958 ,
       -0.78111043, -0.52002944,  0.21392159])

N-grams representation

N-grams encoding was completed by pre-calculating the observed n-mers in the dataset up to a maximum n-mer length of the full peptide length (12 residues). The entire peptide was represented at once as a 138,622 length vector, where each index of the vector describes an n-mer motif that is either present (1) or absent (0) in the peptide.

# minlengthfinder identifies the minimum peptide length in the input dataset (12 here)
def minlengthfinder(lst):
    return min(len(x) for x in lst)

NGRAM_MAX = minlengthfinder(seq_list)
print(f'NGRAM_MAX is {NGRAM_MAX}, now calculating n_mers (the index of the N-grams vector)',flush=True)
NGRAM_MAX is 12, now calculating n_mers (the index of the N-grams vector)
def cal_n_mers(seq_list, n): # enumerate and index of all the unique n_mers in the input dataset
    theo_max = len(Std_All_AA_Smiles)**n

    n_mers = set()
    for seq in seq_list:
        all_mers = [seq[i:i+n] for i in range(len(seq)-n+1)]
        n_mers.update(all_mers)
        
        if len(n_mers) == theo_max:
            print('NGrams pre-calculation reached the theoretical maximum and stopped')
            break
    return n_mers

# n-mers are calculated here, if additional peptides are added, the same n-mers should be used
n_mers = [list(cal_n_mers(seq_list, n)) for n in range(1, NGRAM_MAX+1)] 
def n_grams(seq,n_mers):
    n_gram = []
    for n_mer in n_mers:
        n = len(n_mer[0])
        seq_mers = [seq[i:i+n] for i in range(len(seq)-n+1)]
        for mer in n_mer:
            n_gram.append(seq_mers.count(mer))
    return np.array(n_gram)

def enc_ngrams(seqs,n_mers):
    return [n_grams(seq,n_mers) for seq in seqs]
seq_rep['NGrams'] = enc_ngrams(seq_list,n_mers)
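
As a rough sanity check, the length of each N-grams vector should equal the total number of pre-calculated n-mers (the 138,622 figure quoted above depends on the exact dataset):

total_mers = sum(len(mers) for mers in n_mers)
print(f'{total_mers} unique n-mers -> NGrams vector length {len(seq_rep["NGrams"][0])}')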

Dim Reduction with PCA

import matplotlib.pyplot as plt
from sklearn import decomposition
data = {}
SEED = 108
pca_fn_dict = {}
pca_embedding_dict = {}
for rep in seq_rep:
    ref_feature = seq_rep[rep]

    pca = decomposition.PCA(n_components=2, random_state=SEED)
    embedding = pca.fit_transform(ref_feature)
    pca_fn_dict[rep] = pca
    pca_embedding_dict[rep] = embedding
data['PCA'] = pca_embedding_dict
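
A quick diagnostic, not part of the original workflow, is to check how much variance the two retained components actually capture for each representation:

for rep, pca in pca_fn_dict.items():
    print(f'{rep}: PC1+PC2 explain {pca.explained_variance_ratio_.sum():.1%} of the variance')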

Dim Reduction with UMAP

#!pip install umap-learn
import umap.umap_ as umap
# Dictionaries will hold the data, in the format umap_all[min_dist][representation][n_neighbors]
umap_all = {}
umap_fn_dict = {}
umap_embeddding_dict = {}
n_bors = [8]  # exponents for n_neighbors (2**8 = 256); add more values to check whether the UMAP embeddings are stable
min_dict = {'OneHot': 0.4, 'Fingerprint': 0.1, 'PhysProperties': 0.1, 'NGrams': 0.1}
# Rogers-Tanimoto is a dissimilarity for binary vectors (one-hot, fingerprint, n-grams); Euclidean suits the continuous physicochemical encoding
metric_dict = {'OneHot': 'rogerstanimoto', 'Fingerprint': 'rogerstanimoto', 'PhysProperties': 'euclidean', 'NGrams': 'rogerstanimoto'}

for rep in seq_rep:              # iterate over representation
    min_num = min_dict[rep]
    print(f'min_num is {min_num} for {rep}')
    ref_feature = seq_rep[rep]

    reducer_scaled_dict = {}
    embedding_scaled_dict = {}

    for n_neighbors in n_bors:   # iterate over n_neighbors if n_bors is constructed with multiple numbers
        print(f'n_neighbors is {n_neighbors}',flush=True)
        reducer_scaled_dict[2**n_neighbors] = umap.UMAP(
            n_components=2, n_neighbors=2**n_neighbors, min_dist=min_dict[rep], random_state=SEED, metric=metric_dict[rep])
        embedding_scaled_dict[2**n_neighbors] = reducer_scaled_dict[2**n_neighbors].fit_transform(ref_feature)

    umap_fn_dict[rep] = reducer_scaled_dict            
    umap_embeddding_dict[rep] = embedding_scaled_dict
    umap_all[min_num] = umap_embeddding_dict
data['UMAP'] = umap_all
min_num is 0.4 for OneHot
n_neighbors is 8


/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1879: UserWarning: gradient function is not yet implemented for rogerstanimoto distance metric; inverse_transform will be unavailable
  warn(
/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1943: UserWarning: n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


min_num is 0.1 for Fingerprint
n_neighbors is 8


/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1879: UserWarning: gradient function is not yet implemented for rogerstanimoto distance metric; inverse_transform will be unavailable
  warn(
/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1943: UserWarning: n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


min_num is 0.1 for PhysProperties
n_neighbors is 8


/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1943: UserWarning: n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


min_num is 0.1 for NGrams
n_neighbors is 8


/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1879: UserWarning: gradient function is not yet implemented for rogerstanimoto distance metric; inverse_transform will be unavailable
  warn(
/Users/sasha/anaconda3/lib/python3.10/site-packages/umap/umap_.py:1943: UserWarning: n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
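
The nested dictionary can then be indexed as data['UMAP'][min_dist][representation][n_neighbors]; for example, the one-hot UMAP coordinates computed above can be pulled out like this:

emb = data['UMAP'][0.4]['OneHot'][2**8]
emb.shape  # (4104, 2) array of UMAP coordinates for the one-hot encoding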

# Choose which dimensionality reduction techniques to plot
Available_Dim_red = {'PCA': True, 'UMAP': True}

dim_red_list = []
for key, value in Available_Dim_red.items():
    if value: dim_red_list.append(key)
print(dim_red_list)
['PCA', 'UMAP']
def lister(list1,list2): # Makes combinations of each element in both lists for plotting
    out = []
    for i in list1:
        for l in list2:
            out.append((i,l))
    return out
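# Note (illustrative alternative): the same (reduction, encoding) pairs could also be built with the
# standard library, e.g. list(itertools.product(dim_red_list, seq_rep_list))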
fig, axes = plt.subplots(len(dim_red_list),len(seq_rep_list),figsize = (6*len(seq_rep_list),5*len(dim_red_list)))
reps_plotting = lister(dim_red_list,seq_rep_list)

for ax, (red,enc) in zip(fig.axes,reps_plotting):
    print(enc,red)
    if red == 'UMAP':
        C1 = data[red][min_dict[enc]][enc][2**n_bors[0]][:,0]
        C2 = data[red][min_dict[enc]][enc][2**n_bors[0]][:,1]
        ax.set_title(f'{red} min_dist {min_dict[enc]} n_neighbors {2**n_bors[0]} with {enc}')

    else:
        C1 = data[red][enc][:,0]
        C2 = data[red][enc][:,1]    
        ax.set_title(f'{red} with {enc}')

    plot = ax.scatter(C1,C2,s=30,marker='o',alpha=0.15,c='tab:blue') 

    ax.set_xlabel(f'{red} {enc} C1')
    ax.set_ylabel(f'{red} {enc} C2')

plt.tight_layout()
#plt.savefig(f'{Savetitle} DimRed Embeddings.png',dpi=300)
plt.show()
OneHot PCA
PhysProperties PCA
Fingerprint PCA
NGrams PCA
OneHot UMAP
PhysProperties UMAP
Fingerprint UMAP
NGrams UMAP


[Figure: 2x4 grid of scatter plots showing the PCA and UMAP embeddings for each encoding]
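
The clustering section below reads a file named PCA OneHot.csv, whose creation is not shown in this notebook. A hypothetical export along these lines (column names inferred from the file that is read back; treat this purely as an illustration) could produce a similar file:

# Hypothetical export step (not shown in the original notebook)
out = pd.DataFrame({
    'Peptide': seq_list,
    'Label': df['Label'],
    'PCA OneHot C1': data['PCA']['OneHot'][:, 0],
    'PCA OneHot C2': data['PCA']['OneHot'][:, 1],
})
out.to_csv('./PCA OneHot.csv', index=False)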

Looking at clusters - one example from the PCA one-hot encoding

from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
df1 = pd.read_csv('./PCA OneHot.csv')
df1.head()

Peptide Label PCA OneHot C1 PCA OneHot C2
0 LPWKWPPFYERF Nonspecific 0.668123 -0.103624
1 FFFYQMYPWWAY Nonspecific -0.020174 0.059053
2 FSVAPHWLWTYH Nonspecific 0.413959 0.273859
3 WLYLPYWTEHPQ Nonspecific 0.386628 0.191751
4 WGQMFTWFDHQP Nonspecific -0.046315 0.104411
all_column_list = df1.columns.to_list()
C1_col_name = [s for s in all_column_list if 'C1' in s][0]
C2_col_name = [s for s in all_column_list if 'C2' in s][0]
X = np.asarray([(float(x), float(y)) for x, y in zip(df1[C1_col_name], df1[C2_col_name])])
X.shape
(4104, 2)
# n_clusters values to try for agglomerative clustering (25 is used here)
n_clusters_range = [25]

for n_clusters in n_clusters_range:
    model = AgglomerativeClustering(n_clusters=n_clusters) # define the model
    print(f'AggCl detection, n_clusters is {n_clusters}')

    yhat = model.fit_predict(X) # fit the model and predict clusters; every row gets a cluster label
    yhat = yhat + 1 # clusters start at 0, and we want to label them starting at 1
    clusters = np.unique(yhat)

    # The +1 keeps the indexing compatible with both DBSCAN (which has a noise cluster 0) and AggCl (which does not)
    Center_ave = np.zeros([len(clusters)+1,2])

    fig,ax = plt.subplots()
    fig.set_size_inches(12, 9)
    for i in clusters:
        row_ix = np.where(yhat == i)  # get row indexes for samples in this cluster
        if i != 0:
            # Plot the cluster members
            plt.scatter(X[row_ix, 0], X[row_ix, 1], alpha = 0.3)
            # Plot the center of each cluster and annotate it with its number label
            Center_ave[i,0] = np.average(X[row_ix, 0])
            Center_ave[i,1] = np.average(X[row_ix, 1])
            plt.scatter(Center_ave[i,0], Center_ave[i,1], alpha=0.8, s=50, color='black')
            plt.annotate(i, (Center_ave[i,0], Center_ave[i,1]), fontsize = 30, alpha=1)

        if i == 0:
            plt.scatter(X[row_ix, 0], X[row_ix, 1], alpha = 0.3, color='gray') # plot the noise points less intensely

AggCl detection, n_clusters is 25

[Figure: scatter plot of the 25 agglomerative clusters in the PCA one-hot embedding, with numbered cluster centers]
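
One possible follow-up, not in the original notebook, is to attach the cluster labels back to the peptide table so individual clusters can be pulled out and inspected:

# Hypothetical follow-up: attach cluster labels and count members per cluster
df1['Cluster'] = yhat
print(df1.groupby('Cluster')['Peptide'].count())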

