Парсинг Pubmed на противораковые натуральные вещества

в 19:25, рубрики: python, Алгоритмы, парсинг

В этой статье приведён расчёт натуральных веществ, полученных парсингом базы биомедицинских публикаций PubMed. Для научных целей также полезно рассчитывать комбинации молекул, но для этого требуется транскриптом, который можно получить, добавляя лекарство к клеткам HeLa.

>>> for u in range(1,len(cancelmol)):
...  print(cancelmol[s9[-u]],u)
... 
Curcumin 1
Metformin 2
Resveratrol 3
Quercetin 4
Sulforaphane 5
Melatonin 6
Berberine 7
Celecoxib 8
Triptolide 9
Emodin 10
Simvastatin 11
Trametinib 12
Luteolin 13
Baicalein 14
Plumbagin 15
Celastrol 16
Capecitabine 17
Cryptotanshinone 18
Tocotrienol 19
Astragaloside 20
Fisetin 21
Licochalcone 22
Pterostilbene 23
Nobiletin 24
Kaempferol 25
Chrysin 26
Capsaicin  27
Withaferin A 28
Icariin 29
Epigallocatechin-3-gallate 30
Astaxanthin 31
Xanthohumol 32
Rutin 33
Vitamin E 34
Itraconazole 35
Cannabidiol 36
Naringenin 37
Trichostatin A 38
ursolic acid 39
Myricetin 40
Disulfiram 41
Arctigenin 42
Lutein 43
Phenformin 44
Piceatannol 45
Mifepristone 46
Gallic Acid 47
Minocycline 48
Indirubin 49
Deguelin 50
Polydatin 51
Chlorogenic Acid 52
Theophylline 53
Cardamonin 54
Hesperidin 55
Fucoxanthin 56
Indomethacin 57
Thioridazine 58
Valproic acid 59
Urolithin A 60
Diosgenin  61
Calycosin 62
Naringin 63
Palmatine 64
Isorhamnetin 65
Ivermectin 66
Amlexanox 67
Butein 68
Rosmarinic acid 69
Prunetin 70
Lonidamine 71
Ginkgo biloba extract 72
Doxycycline  73
Juglone 74
Verapamil 75
Spermidine 76
Omega-6 77
Carnosol 78
Menadione 79
Trehalose 80
Sodium salicylate 81
Ellagic acid 82
Auraptene 83
Selenomethionine 84
Dihydroxyflavone 85
6-shogaol 86
Pictilisib 87
Carnosic acid 88
Taxifolin 89
Vitexin 90
Naproxen 91
Geldanamycin 92
Lonicera japonica 93
Nordihydroguaiaretic acid 94
Sesamin 95
Thiostrepton 96
Carnosine 97
Carvedilol 98
Rotenone 99
Captopril 100
Acetazolamide 101
Chlorpromazine 102
Cinnamon Extract 103
Ascorbic acid 104
Scutellarein 105
Caffeic acid 106
Lithium Chloride 107
Anisomycin 108
Carbidopa 109
Echinacoside 110
Canagliflozin 111
Gedunin 112
Chicoric acid 113
SkQ1 114
Enoxacin 115
Green tea extract 116
Norcantharidin 117
Protocatechuic acid 118
Diethyldithiocarbamate 119
Sesamolin 120
Tannic acid 121
Vinpocetine 122
Staurosporine 123
4-phenylbutyrate 124
Reserpine 125
Gramicidin 126
Procaine 127
Glaucarubinone 128
Buthionine Sulfoximine 129
Phosphatidylcholine 130
Buformin 131
Secoisolariciresinol Diglucoside 132
Tamarixetin 133
2-deoxy-D-glucose 134
Limonin 135
Geranylgeranylacetone 136
Astemizole 137
Sorbitol 138
Pyrroloquinoline quinone 139
Imatinib mesylate 140
Alpha-lipoic acid 141
Salicylic acid 142
Sulindac Sulfide 143
Naphthazarin 144
Kinetin 145
Kynurenic acid 146
Dehydroabietic acid 147
Phloridzin 148
Promethazine 149
Wortmannin 150
Cadmium chloride 151
Ganoderma lucidum extract 152
Sodium Nitroprusside 153
Scriptaid 154
Rhodiola rosea extract 155
Ciclopirox olamine 156
Antimycin A 157
Benzethonium 158
Myriocin 159
Coix seed oil 160
Propyl gallate 161
Tectochrysin 162
D-glucosamine 163
Tiliroside 164
Vortioxetin 165
Sodium citrate 166
Mercuric chloride 167
Perphenazine  168
Pinitol 169
Cyproterone acetate 170
from Bio import Entrez
from Bio import Medline
import re
# Upper bound on the number of PubMed IDs requested per search (passed as
# ``retmax``). The search loops further down reset it to 400 before querying.
MAX_COUNT = 1000
# Contact address that Biopython sends to NCBI with every E-utilities request.
Entrez.email = 'a-nai@yandex.ru'


import time
import numpy as np
import pickle

# Load previously pickled inputs. The files are opened in ``with`` blocks so
# the handles are closed as soon as each object has been read.
# NOTE: unpickling can execute arbitrary code; only load files you produced
# yourself.
with open("drugs2.pkl", "rb") as _fh:
    drugs2 = pickle.load(_fh)
with open("cancelmol.pkl", "rb") as _fh:
    # presumably a collection of molecule names -- TODO confirm against the
    # code that wrote the pickle
    cancelmol = pickle.load(_fh)

# Hand-picked molecule names.
molecules=['Apigenin', 'Chaetocin', 'Chrysin', 'Curcumin', 'Epigallocatechin gallate (EGCG)', 'Luteolin', 'Myricetin', 'Quercetin', 'Resveratrol', 'Wogonin', 'Brusatol', 'Piperlongumine', 'Trigonelline', 'Pentyl isothiocyanate (PEITC)', 'Pleurotin', 'Plumbagin', 'EM23', 'Parthenolid', 'Sulforaphane', 'Melatonin']

import csv
# Read the DrugAge export and build the de-duplicated list of compound names.
drugA = []
with open('/Users/andrejeremcuk/Downloads/DrugAge Browse (1).csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        print(row)
        drugA.append(row)

# Skip the header row (index 0). For every remaining non-empty row take the
# first column, keep the text before the first comma and strip double quotes.
# Blank CSV lines come back from csv.reader as [] and are skipped here instead
# of raising IndexError on row[0].
t = [str(row[0].split(',')[0]).replace('"', '') for row in drugA[1:] if row]

# Drop duplicates while keeping first-occurrence order (dicts preserve
# insertion order), replacing the quadratic "if i not in s" scan.
s = list(dict.fromkeys(t))

# All three names refer to the same de-duplicated list from here on.
drugA = s
drugAb = s
# Substrings (case-sensitive) that mark a title as reporting an anti-cancer
# effect. A bare 'cancer' is deliberately not in the list: the original check
# computed it but never used it in the decision.
_ANTICANCER_KEYWORDS = (
    'apoptosis',
    'Inhibition of cancer',
    'autophagy',
    'Autophagy',
    'anticancer',
    'antiproliferative',
    'apoptotic',
    'Inhibits Cancer Metastasis',
    'Angiogenesis',
    'cytotoxicity',
    'anti-tumor',
    'anticancer therapy',
    'antitumor',
    'inhibits',
    'cancer growth',
    'cancer therapy',
    'antiangiogenic',
    'anti-cancer',
)


def forfer(l,drug7):
    """Tell whether title *l* mentions *drug7* together with an anti-cancer keyword.

    Matching is plain, case-sensitive substring search.

    Args:
        l: Text to inspect (the caller passes article titles).
        drug7: Drug name, or name fragment, that must occur in *l*.

    Returns:
        'yes' if *drug7* occurs in *l* and at least one keyword from
        ``_ANTICANCER_KEYWORDS`` occurs in *l*; otherwise 'no'.
    """
    # Guard clause: without the drug name the title is irrelevant.
    if drug7 not in l:
        return 'no'
    # The former debug print read the global loop counter ``u`` and so raised
    # NameError whenever the function was used outside that loop; removed.
    if any(keyword in l for keyword in _ANTICANCER_KEYWORDS):
        return 'yes'
    return 'no'

# Substrings (case-sensitive) that mark a title as reporting a
# cancer-promoting effect.
_PROCANCER_KEYWORDS = (
    'accelerate',
    'promotes',
    'Stimulates',
    'Metastasis',
    'promote',
    'tumorigenesis',
    'metastasis',
    'stimulates',
)


def ferfor(l,drug7):
    """Tell whether title *l* mentions *drug7* together with a pro-cancer keyword.

    Matching is plain, case-sensitive substring search.

    Args:
        l: Text to inspect.
        drug7: Drug name, or name fragment, that must occur in *l*.

    Returns:
        'yes' if *drug7* occurs in *l* and at least one keyword from
        ``_PROCANCER_KEYWORDS`` occurs in *l*; otherwise 'no'.
    """
    if drug7 in l and any(keyword in l for keyword in _PROCANCER_KEYWORDS):
        return 'yes'
    return 'no'


# Cancer types combined with each drug name to build PubMed search terms.
# This list used to sit inside ferfor() after its return statements, where it
# was unreachable, so the module-level name was never defined.
# NOTE(review): 'Colonorectal Cancers' looks like a misspelling of
# 'Colorectal Cancer'; left unchanged because the string is used verbatim as a
# search term -- confirm before editing.
cancers=['Liver Cancer', 'Thyroid Cancer', 'Pancreatic Cancer', 'Leukemia', 'Kidney Cancer', 'Non-Hodgkins Lymphoma', 'Bladder Cancer', 'Melanoma', 'Colonorectal Cancers', 'Prostate Cancer', 'Lung Cancer', 'Breast Cancer', 'squamous cell carcinoma', 'gastric cancer'];
     
# For every drug name, search PubMed for "<drug> cancer", fetch the titles of
# the hits and keep those that forfer() flags as anti-cancer.
#   ii6 -- titles matched through the lower-cased-first-letter spelling only
#   ii7 -- titles matched through any spelling variant
# Each drug's section in both lists starts with an ('it', name, index) marker.
ii6 = []
ii7 = []
MAX_COUNT = 400
for u, drug in enumerate(drugA):
    TERM = drug + ' cancer'
    print(drug, u)
    ii7.append(('it', drug, u))
    ii6.append(('it', drug, u))

    # One retry after a short pause if the search request fails.
    try:
        h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
    except Exception:
        time.sleep(5)
        print('error', u)
        h = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
    result = Entrez.read(h)
    h.close()

    ids = result['IdList']
    if not ids:
        # Nothing found: do not issue a fetch request with an empty id list.
        continue

    h = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
    # Records without a title ('TI') are stored as '0'. The loop variable is
    # named ``record`` so that the ``re`` module is not shadowed.
    fer = [record.get('TI', '0') for record in Medline.parse(h)]
    h.close()

    # Spelling variants tried against each title: the full name, the name with
    # a lower-cased first letter, and the first word of a multi-word name.
    # Local names are used so the module-level list ``s`` is not overwritten.
    lowered = drug[:1].lower() + drug[1:]
    first_word = drug.split(' ', 1)[0]

    for title in fer:
        lower_hit = forfer(title, lowered) == 'yes'
        if lower_hit:
            ii6.append((title, drug))
        if (lower_hit
                or forfer(title, drug) == 'yes'
                or forfer(title, first_word) == 'yes'):
            ii7.append((title, drug))


     
    
# Re-import so that ``re`` is guaranteed to be the regex module here, even if
# an earlier statement in this script rebound the name.
import re

# For every (drug, cancer type) pair, search PubMed and keep the titles that
# mention both, case-insensitively. Each drug's section in iii7 starts with an
# ('it', name, index) marker followed by (title, drug, cancer) tuples.
iii7 = []
MAX_COUNT = 400

# Names are escaped so characters such as '(' or '+' in a drug name are
# matched literally instead of being interpreted as regex syntax.
cancer_patterns = [
    (cancer, re.compile(re.escape(cancer), re.IGNORECASE)) for cancer in cancers
]

# Iterate the de-duplicated drug list through ``drugA``; the name ``s`` may no
# longer refer to that list at this point in the script.
for u, drug in enumerate(drugA):
    iii7.append(('it', drug, u))
    drug_pattern = re.compile(re.escape(drug), re.IGNORECASE)

    for cancer, cancer_pattern in cancer_patterns:
        TERM = drug + ' ' + cancer
        print(drug, u)

        # One retry after a short pause if the search request fails.
        try:
            hi = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
        except Exception:
            time.sleep(5)
            print('error', u)
            hi = Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM)
        result = Entrez.read(hi)
        hi.close()

        ids = result['IdList']
        if not ids:
            # Nothing found: do not issue a fetch request with an empty id list.
            continue

        hi = Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text')
        # Records without a title ('TI') are stored as '0'.
        fer = [record.get('TI', '0') for record in Medline.parse(hi)]
        hi.close()

        for title in fer:
            if cancer_pattern.search(title) and drug_pattern.search(title):
                iii7.append((title, drug, cancer))
# Load previously pickled results. The files are opened in ``with`` blocks so
# the handles are closed as soon as each object has been read.
# NOTE: unpickling can execute arbitrary code; only load files you produced
# yourself.
with open("i8cm.pkl", "rb") as _fh:
    i8cm = pickle.load(_fh)

with open("cancelmol.pkl", "rb") as _fh:
    cancelmol = pickle.load(_fh)

with open("i7cm.pkl", "rb") as _fh:
    i7cm = pickle.load(_fh)

with open("ma00pp.pkl", "rb") as _fh:
    ma00pp = pickle.load(_fh)

Github Code

Автор: Андрей Еремчук

Источник

* - обязательные к заполнению поля


https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js