mirror of
https://github.com/BaranziniLab/KG_RAG.git
synced 2024-06-08 14:12:54 +03:00
adding new notebooks
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,3 +1,4 @@
|
||||
logs/
|
||||
data/results/cypher_rag_output_2M_spoke.csv
|
||||
test_notebooks/create_node_context_file_for_2M_spoke_graph.ipynb
|
||||
notebooks/cypher_rag_using_langchain_2M.ipynb
|
||||
|
||||
323
data/dataset_for_entity_retrieval_accuracy_analysis.csv
Normal file
323
data/dataset_for_entity_retrieval_accuracy_analysis.csv
Normal file
@@ -0,0 +1,323 @@
|
||||
text,node_hits
|
||||
LIRAGLUTIDE TREATS OBESITY,obesity
|
||||
disease ontology identifier for central diabetes insipidus is doid:350,central diabetes insipidus
|
||||
"Xeroderma pigmentosum, group G is not associated with Gene ERCC5",xeroderma pigmentosum
|
||||
cherubism is not a autosomal dominant disease,cherubism
|
||||
MASA SYNDROME (DISORDER) IS NOT ASSOCIATED WITH GENE L1CAM,MASA syndrome
|
||||
CONGENITAL GENERALIZED LIPODYSTROPHY TYPE 2 ASSOCIATES GENE BSCL2,congenital generalized lipodystrophy type 2
|
||||
PRASTERONE TREATS OBESITY,obesity
|
||||
CONGENITAL CONTRACTURAL ARACHNODACTYLY ASSOCIATES GENE FBN2,congenital contractural arachnodactyly
|
||||
PAROXYSMAL NONKINESIGENIC DYSKINESIA 1 IS NOT ASSOCIATED WITH GENE PNKD,paroxysmal nonkinesigenic dyskinesia 1
|
||||
Acute intermittent porphyria is not associated with Gene HMBS,acute intermittent porphyria
|
||||
Disease ontology identifier for gray platelet syndrome is DOID:0111044,gray platelet syndrome
|
||||
Hyperargininemia is not associated with Gene ARG1,hyperargininemia
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR MARFAN SYNDROME IS DOID:0060055,Marfan syndrome
|
||||
FACTOR IX TREATS HEMOPHILIA B,hemophilia B
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR MOSAIC VARIEGATED ANEUPLOIDY SYNDROME 1 IS DOID:0080141,mosaic variegated aneuploidy syndrome 1
|
||||
noonan syndrome associates gene kras,Noonan syndrome
|
||||
L-2-HYDROXYGLUTARIC ACIDURIA associates Gene L2HGDH,L-2-hydroxyglutaric aciduria
|
||||
AZITHROMYCIN TREATS CYSTIC FIBROSIS,cystic fibrosis
|
||||
disease ontology identifier for smith-magenis syndrome is doid:12271,Smith-Magenis syndrome
|
||||
COFFIN-SIRIS SYNDROME IS A SYNDROMIC DISEASE,Coffin-Siris syndrome
|
||||
antithrombin iii deficiency is not associated with gene serpinc1,antithrombin III deficiency
|
||||
DEHYDROEPIANDROSTERONE TREATS OBESITY,obesity
|
||||
MULIBREY NANISM IS A SYNDROMIC DISEASE,mulibrey nanism
|
||||
cleidocranial dysplasia associates gene runx2,cleidocranial dysplasia
|
||||
ASPARTYLGLUCOSAMINURIA IS NOT ASSOCIATED WITH GENE AGA,aspartylglucosaminuria
|
||||
Brachydactyly type C is not associated with Gene GDF5,brachydactyly type C
|
||||
Wolman Disease associates Gene LIPA,Wolman disease
|
||||
adenine phosphoribosyltransferase deficiency associates gene aprt,adenine phosphoribosyltransferase deficiency
|
||||
Neurofibromatosis 2 is not associated with Gene NF2,neurofibromatosis 2
|
||||
HARTNUP DISEASE IS A INBORN DISORDER OF AMINO ACID TRANSPORT,Hartnup disease
|
||||
campomelic dysplasia associates gene sox9,campomelic dysplasia
|
||||
DOYNE HONEYCOMB RETINAL DYSTROPHY associates Gene EFEMP1,Doyne honeycomb retinal dystrophy
|
||||
enhanced S-cone syndrome is not a vitreoretinal degeneration,enhanced S-cone syndrome
|
||||
melanoma associates Gene BRAF,melanoma
|
||||
congenital contractural arachnodactyly is not a congenital nervous system disorder,congenital contractural arachnodactyly
|
||||
CONGENITAL AMEGAKARYOCYTIC THROMBOCYTOPENIA IS NOT ASSOCIATED WITH GENE MPL,congenital amegakaryocytic thrombocytopenia
|
||||
LEIGH DISEASE ASSOCIATES GENE SURF1,Leigh disease
|
||||
cystic fibrosis is a respiratory system disorder,cystic fibrosis
|
||||
Neurofibromatosis 1 associates Gene NF1,neurofibromatosis 1
|
||||
"Robinow syndrome, autosomal recessive associates Gene ROR2",Robinow syndrome
|
||||
polycythemia vera is not associated with gene jak2,polycythemia vera
|
||||
PSEUDOPSEUDOHYPOPARATHYROIDISM ASSOCIATES GENE GNAS,pseudopseudohypoparathyroidism
|
||||
Rothmund-Thomson syndrome is not associated with Gene RECQL4,Rothmund-Thomson syndrome
|
||||
Alveolar rhabdomyosarcoma associates Gene PAX3,alveolar rhabdomyosarcoma
|
||||
ellis-van creveld syndrome associates gene evc2,Ellis-Van Creveld syndrome
|
||||
Coffin-Lowry syndrome is not a X-linked syndromic intellectual disability,Coffin-Lowry syndrome
|
||||
CHERUBISM ASSOCIATES GENE SH3BP2,cherubism
|
||||
agalsidase alfa treats fabry disease,Fabry disease
|
||||
disease ontology identifier for lesch-nyhan syndrome is doid:1919,Lesch-Nyhan syndrome
|
||||
disease ontology identifier for autosomal dominant hypophosphatemic rickets is doid:0050948,autosomal dominant hypophosphatemic rickets
|
||||
nail-patella syndrome associates gene lmx1b,nail-patella syndrome
|
||||
mulibrey nanism is not associated with gene trim37,mulibrey nanism
|
||||
MYOCLONIC DYSTONIA IS NOT ASSOCIATED WITH GENE SGCE,myoclonic dystonia
|
||||
pseudoachondroplasia associates gene comp,pseudoachondroplasia
|
||||
EVEROLIMUS TREATS TUBEROUS SCLEROSIS,tuberous sclerosis
|
||||
DiGeorge syndrome is a congenital T-cell immunodeficiency,DiGeorge syndrome
|
||||
INSULIN TREATS OBESITY,obesity
|
||||
disease ontology identifier for mastocytosis is doid:0060768,mastocytosis
|
||||
bevacizumab treats hereditary hemorrhagic telangiectasia,hereditary hemorrhagic telangiectasia
|
||||
Disease ontology identifier for Farber lipogranulomatosis is DOID:0050464,Farber lipogranulomatosis
|
||||
DENYS-DRASH SYNDROME IS NOT ASSOCIATED WITH GENE WT1,Denys-Drash syndrome
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR NORRIE DISEASE IS DOID:0060599,Norrie disease
|
||||
"XERODERMA PIGMENTOSUM, COMPLEMENTATION GROUP E ASSOCIATES GENE DDB2",xeroderma pigmentosum
|
||||
Disease ontology identifier for popliteal pterygium syndrome is DOID:0050756,popliteal pterygium syndrome
|
||||
Ornithine carbamoyltransferase deficiency associates Gene OTC,ornithine carbamoyltransferase deficiency
|
||||
Wiskott-Aldrich Syndrome is not associated with Gene WAS,Wiskott-Aldrich syndrome
|
||||
Tangier Disease is not associated with Gene ABCA1,Tangier disease
|
||||
disease ontology identifier for omenn syndrome is doid:3633,Omenn syndrome
|
||||
LONG QT SYNDROME 1 ASSOCIATES GENE KCNQ1,long QT syndrome 1
|
||||
allan-herndon-dudley syndrome (ahds) is not associated with gene slc16a2,Allan-Herndon-Dudley syndrome
|
||||
ETHYLMALONIC ENCEPHALOPATHY IS NOT ASSOCIATED WITH GENE ETHE1,ethylmalonic encephalopathy
|
||||
Hemophilia B is not associated with Gene F9,hemophilia B
|
||||
Mowat-Wilson syndrome is not associated with Gene ZEB2,Mowat-Wilson syndrome
|
||||
HARTNUP DISEASE IS NOT ASSOCIATED WITH GENE SLC6A19,Hartnup disease
|
||||
Disease ontology identifier for campomelic dysplasia is DOID:0050463,campomelic dysplasia
|
||||
"HEMOCHROMATOSIS, TYPE 4 ASSOCIATES GENE SLC40A1",hemochromatosis
|
||||
disease ontology identifier for rothmund-thomson syndrome is doid:2732,Rothmund-Thomson syndrome
|
||||
Autosomal Recessive Polycystic Kidney Disease associates Gene PKHD1,autosomal recessive polycystic kidney disease
|
||||
adenine phosphoribosyltransferase deficiency is a inborn disorder of amino acid metabolism,adenine phosphoribosyltransferase deficiency
|
||||
Angelman syndrome is a syndromic disease,Angelman syndrome
|
||||
Tay-Sachs disease is not a eye degenerative disorder,Tay-Sachs disease
|
||||
spinocerebellar ataxia type 5 associates gene sptbn2,spinocerebellar ataxia type 5
|
||||
rapp-hodgkin syndrome is a autosomal dominant disease,Rapp-Hodgkin syndrome
|
||||
sandhoff disease is not associated with gene hexb,Sandhoff disease
|
||||
johanson-blizzard syndrome associates gene ubr1,Johanson-Blizzard syndrome
|
||||
maple syrup urine disease associates gene dbt,maple syrup urine disease
|
||||
laron syndrome is a autosomal recessive disease,Laron syndrome
|
||||
popliteal pterygium syndrome is not associated with gene irf6,popliteal pterygium syndrome
|
||||
PITT-HOPKINS SYNDROME is not associated with Gene TCF4,Pitt-Hopkins syndrome
|
||||
Alkaptonuria associates Gene HGD,alkaptonuria
|
||||
Hereditary hemorrhagic telangiectasia associates Gene ENG,hereditary hemorrhagic telangiectasia
|
||||
MASTOCYTOSIS IS NOT ASSOCIATED WITH GENE KIT,mastocytosis
|
||||
macrolide antibiotics treats cystic fibrosis,cystic fibrosis
|
||||
Very long chain acyl-CoA dehydrogenase deficiency is not associated with Gene ACADVL,very long chain acyl-CoA dehydrogenase deficiency
|
||||
KUFOR-RAKEB SYNDROME associates Gene ATP13A2,Kufor-Rakeb syndrome
|
||||
protein-tyrosine kinase inhibitor treats sarcoma,sarcoma
|
||||
autosomal dominant hypophosphatemic rickets associates gene fgf23,autosomal dominant hypophosphatemic rickets
|
||||
WAARDENBURG SYNDROME TYPE 1 ASSOCIATES GENE PAX3,Waardenburg syndrome type 1
|
||||
Cystic Fibrosis associates Gene CFTR,cystic fibrosis
|
||||
WHIM syndrome is not associated with Gene CXCR4,WHIM syndrome
|
||||
tuberous sclerosis is a autosomal dominant disease,tuberous sclerosis
|
||||
CHOROIDEREMIA IS NOT ASSOCIATED WITH GENE CHM,choroideremia
|
||||
smith-lemli-opitz syndrome is not associated with gene dhcr7,Smith-Lemli-Opitz syndrome
|
||||
LATE-ONSET RETINAL DEGENERATION (disorder) associates Gene C1QTNF5,late-onset retinal degeneration
|
||||
ibuprofen treats cystic fibrosis,cystic fibrosis
|
||||
JUVENILE POLYPOSIS SYNDROME ASSOCIATES GENE SMAD4,juvenile polyposis syndrome
|
||||
alpha-galactosidase treats fabry disease,Fabry disease
|
||||
pembrolizumab treats melanoma,melanoma
|
||||
PIEBALDISM ASSOCIATES GENE KIT,piebaldism
|
||||
"Charcot-Marie-Tooth Disease, Type Ib associates Gene MPZ",Charcot-Marie-Tooth disease
|
||||
DiGeorge Syndrome is not associated with Gene TBX1,DiGeorge syndrome
|
||||
Fibrodysplasia Ossificans Progressiva associates Gene ACVR1,fibrodysplasia ossificans progressiva
|
||||
Omenn Syndrome associates Gene RAG2,Omenn syndrome
|
||||
Juvenile Spinal Muscular Atrophy associates Gene SMN1,juvenile spinal muscular atrophy
|
||||
antibiotics treats cystic fibrosis,cystic fibrosis
|
||||
HEREDITARY HEMORRHAGIC TELANGIECTASIA IS A AUTOSOMAL DOMINANT DISEASE,hereditary hemorrhagic telangiectasia
|
||||
Fabry Disease associates Gene GLA,Fabry disease
|
||||
GYRATE ATROPHY ASSOCIATES GENE OAT,gyrate atrophy
|
||||
"Bernard-Soulier syndrome is a inherited bleeding disorder, platelet-type",Bernard-Soulier syndrome
|
||||
VITELLIFORM MACULAR DYSTROPHY ASSOCIATES GENE BEST1,vitelliform macular dystrophy
|
||||
Gray Platelet Syndrome is not associated with Gene NBEAL2,gray platelet syndrome
|
||||
Coffin-Siris syndrome associates Gene ARID1B,Coffin-Siris syndrome
|
||||
burkitt lymphoma is a neoplasm of mature b-cells,Burkitt lymphoma
|
||||
multiple endocrine neoplasia type 2b associates gene ret,multiple endocrine neoplasia type 2B
|
||||
anesthetics treats pheochromocytoma,pheochromocytoma
|
||||
PSEUDOACHONDROPLASIA IS A OSTEOCHONDRODYSPLASIA,pseudoachondroplasia
|
||||
Pierson syndrome is not a autosomal recessive disease,Pierson syndrome
|
||||
costello syndrome (disorder) is not associated with gene hras,Costello syndrome
|
||||
familial mediterranean fever associates gene mefv,familial Mediterranean fever
|
||||
Jervell-Lange Nielsen Syndrome is not associated with Gene KCNQ1,Jervell-Lange Nielsen syndrome
|
||||
argininosuccinic aciduria associates gene asl,argininosuccinic aciduria
|
||||
nance-horan syndrome associates gene nhs,Nance-Horan syndrome
|
||||
li-fraumeni syndrome is a autosomal dominant disease,Li-Fraumeni syndrome
|
||||
CANAVAN DISEASE IS A INBORN AMINOACYLASE DEFICIENCY,Canavan disease
|
||||
TETRALOGY OF FALLOT IS NOT A HEREDITARY DISEASE,tetralogy of Fallot
|
||||
Disease ontology identifier for fibrodysplasia ossificans progressiva is DOID:13374,fibrodysplasia ossificans progressiva
|
||||
penicillamine treats cystinuria,cystinuria
|
||||
LONG QT SYNDROME 3 associates Gene SCN5A,long QT syndrome 3
|
||||
"xeroderma pigmentosum, complementation group c associates gene xpc",xeroderma pigmentosum
|
||||
Laron Syndrome associates Gene GHR,Laron syndrome
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR PHEOCHROMOCYTOMA IS DOID:14692,pheochromocytoma
|
||||
CYSTINURIA IS NOT ASSOCIATED WITH GENE SLC3A1,cystinuria
|
||||
loeys-dietz syndrome associates gene tgfbr1,Loeys-Dietz syndrome
|
||||
ANTIBIOTICS TREATS OBESITY,obesity
|
||||
Disease ontology identifier for ethylmalonic encephalopathy is DOID:0060640,ethylmalonic encephalopathy
|
||||
Johanson-Blizzard syndrome is a congenital nervous system disorder,Johanson-Blizzard syndrome
|
||||
peutz-jeghers syndrome associates gene stk11,Peutz-Jeghers syndrome
|
||||
Sandhoff disease is a eye degenerative disorder,Sandhoff disease
|
||||
Coffin-Lowry syndrome associates Gene RPS6KA3,Coffin-Lowry syndrome
|
||||
Leigh Disease associates Gene NDUFS4,Leigh disease
|
||||
choroideremia is not a X-linked disease,choroideremia
|
||||
Bernard-Soulier Syndrome associates Gene GP1BB,Bernard-Soulier syndrome
|
||||
Lafora Disease is not associated with Gene NHLRC1,Lafora disease
|
||||
IMMUNOSUPPRESSIVE AGENTS TREATS CROHN'S DISEASE,Crohn's disease
|
||||
LAFORA DISEASE ASSOCIATES GENE EPM2A,Lafora disease
|
||||
Pheochromocytoma is not associated with Gene RET,pheochromocytoma
|
||||
Brugada Syndrome (disorder) associates Gene SCN5A,Brugada syndrome
|
||||
Greig cephalopolysyndactyly syndrome associates Gene GLI3,Greig cephalopolysyndactyly syndrome
|
||||
vaccines treats melanoma,melanoma
|
||||
Mucopolysaccharidosis II associates Gene IDS,mucopolysaccharidosis II
|
||||
RAPP-HODGKIN SYNDROME associates Gene TP63,Rapp-Hodgkin syndrome
|
||||
Spinocerebellar Ataxia Type 6 (disorder) associates Gene CACNA1A,spinocerebellar ataxia type 6
|
||||
Saethre-Chotzen Syndrome is not associated with Gene TWIST1,Saethre-Chotzen syndrome
|
||||
Loeys-Dietz Syndrome associates Gene TGFBR2,Loeys-Dietz syndrome
|
||||
Ulnar-mammary syndrome is not associated with Gene TBX3,ulnar-mammary syndrome
|
||||
Marfan Syndrome associates Gene FBN1,Marfan syndrome
|
||||
noonan syndrome associates gene raf1,Noonan syndrome
|
||||
norrie disease associates gene ndp,Norrie disease
|
||||
biotinidase deficiency is not a multiple carboxylase deficiency,biotinidase deficiency
|
||||
burkitt lymphoma is not associated with gene myc,Burkitt lymphoma
|
||||
hyperkalemic periodic paralysis is not a familial periodic paralysis,hyperkalemic periodic paralysis
|
||||
TUBEROUS SCLEROSIS ASSOCIATES GENE TSC2,tuberous sclerosis
|
||||
melphalan treats melanoma,melanoma
|
||||
Disease ontology identifier for beta-mannosidosis is DOID:0111136,beta-mannosidosis
|
||||
pitt-hopkins syndrome is a syndromic disease,Pitt-Hopkins syndrome
|
||||
Fatal Familial Insomnia is not associated with Gene PRNP,fatal familial insomnia
|
||||
FABRY DISEASE IS NOT A DEVELOPMENTAL ANOMALY OF METABOLIC ORIGIN,Fabry disease
|
||||
fatal familial insomnia is not a insomnia,fatal familial insomnia
|
||||
MUENKE SYNDROME ASSOCIATES GENE FGFR3,Muenke Syndrome
|
||||
FARBER LIPOGRANULOMATOSIS IS NOT ASSOCIATED WITH GENE ASAH1,Farber lipogranulomatosis
|
||||
greig cephalopolysyndactyly syndrome is not a syndromic disease,Greig cephalopolysyndactyly syndrome
|
||||
Kleefstra syndrome 1 is a Kleefstra syndrome,Kleefstra syndrome 1
|
||||
mowat-wilson syndrome is a syndromic intellectual disability,Mowat-Wilson syndrome
|
||||
METRONIDAZOLE TREATS CROHN'S DISEASE,Crohn's disease
|
||||
Cystinuria is not associated with Gene SLC7A9,cystinuria
|
||||
Disease ontology identifier for Smith-Lemli-Opitz syndrome is DOID:0080026,Smith-Lemli-Opitz syndrome
|
||||
alpha-d-galactosidase enzyme treats fabry disease,Fabry disease
|
||||
Tay-Sachs Disease associates Gene HEXA,Tay-Sachs disease
|
||||
BIOTINIDASE DEFICIENCY IS NOT ASSOCIATED WITH GENE BTD,biotinidase deficiency
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR ATAXIA TELANGIECTASIA IS DOID:0060010,ataxia telangiectasia
|
||||
Disease ontology identifier for Timothy syndrome is DOID:0060173,Timothy syndrome
|
||||
multiple endocrine neoplasia type 2b is a autosomal dominant disease,multiple endocrine neoplasia type 2B
|
||||
WOLCOTT-RALLISON SYNDROME IS A SYNDROMIC DISEASE,Wolcott-Rallison syndrome
|
||||
Disease ontology identifier for cystinuria is DOID:9266,cystinuria
|
||||
POLYCYSTIC KIDNEY DISEASE 1 ASSOCIATES GENE PKD1,polycystic kidney disease 1
|
||||
Refsum Disease associates Gene PHYH,Refsum disease
|
||||
Nijmegen breakage syndrome is a autosomal recessive disease,Nijmegen breakage syndrome
|
||||
Pierson syndrome is not associated with Gene LAMB2,Pierson syndrome
|
||||
holt-oram syndrome is a autosomal dominant disease,Holt-Oram syndrome
|
||||
Fragile X Syndrome associates Gene FMR1,fragile X syndrome
|
||||
vitelliform macular dystrophy is not a macular degeneration,vitelliform macular dystrophy
|
||||
Pfeiffer Syndrome associates Gene FGFR2,Pfeiffer syndrome
|
||||
Alexander Disease associates Gene GFAP,Alexander disease
|
||||
"basal ganglia disease, biotin-responsive associates gene slc19a3",basal ganglia disease
|
||||
Progeria associates Gene LMNA,progeria
|
||||
infantile hypophosphatasia associates gene alpl,infantile hypophosphatasia
|
||||
"xeroderma pigmentosum, group b associates gene ercc3",xeroderma pigmentosum
|
||||
Microvillus inclusion disease is not associated with Gene MYO5B,microvillus inclusion disease
|
||||
Smith-Magenis syndrome associates Gene RAI1,Smith-Magenis syndrome
|
||||
LIVER CARCINOMA IS NOT ASSOCIATED WITH GENE MET,liver carcinoma
|
||||
"fanconi anemia, complementation group d2 is not associated with gene fancd2",Fanconi anemia
|
||||
tetralogy of fallot associates gene zfpm2,tetralogy of Fallot
|
||||
ARGIPRESSIN TREATS CENTRAL DIABETES INSIPIDUS,central diabetes insipidus
|
||||
noonan syndrome associates gene sos1,Noonan syndrome
|
||||
Denys-Drash syndrome is a autosomal dominant disease,Denys-Drash syndrome
|
||||
familial Mediterranean fever is not a primary immunodeficiency due to a genetic defect in innate immunity,familial Mediterranean fever
|
||||
Disease ontology identifier for ornithine carbamoyltransferase deficiency is DOID:9271,ornithine carbamoyltransferase deficiency
|
||||
Achondroplasia is not a osteochondrodysplasia,achondroplasia
|
||||
unverricht-lundborg syndrome is not associated with gene cstb,Unverricht-Lundborg syndrome
|
||||
alpha-Mannosidosis associates Gene MAN2B1,alpha-mannosidosis
|
||||
6-MERCAPTOPURINE TREATS CROHN'S DISEASE,Crohn's disease
|
||||
enhanced s-cone syndrome is not associated with gene nr2e3,enhanced S-cone syndrome
|
||||
ADRENAL CORTEX HORMONES TREATS CROHN'S DISEASE,Crohn's disease
|
||||
BIETTI CRYSTALLINE CORNEORETINAL DYSTROPHY is not associated with Gene CYP4V2,Bietti crystalline corneoretinal dystrophy
|
||||
AGALSIDASE BETA TREATS FABRY DISEASE,Fabry disease
|
||||
HEMOPHILIA B IS A HEMORRHAGIC DISEASE,hemophilia B
|
||||
Li-Fraumeni Syndrome is not associated with Gene TP53,Li-Fraumeni syndrome
|
||||
"xeroderma pigmentosum, group f associates gene ercc4",xeroderma pigmentosum
|
||||
WOLMAN DISEASE IS A LYSOSOMAL ACID LIPASE DEFICIENCY,Wolman disease
|
||||
alveolar rhabdomyosarcoma is not a rhabdomyosarcoma,alveolar rhabdomyosarcoma
|
||||
Aniridia is not associated with Gene PAX6,aniridia
|
||||
argininosuccinic aciduria is a amino acid metabolism disease,argininosuccinic aciduria
|
||||
"charcot-marie-tooth disease, type 4c associates gene sh3tc2",Charcot-Marie-Tooth disease
|
||||
Werner Syndrome associates Gene WRN,Werner syndrome
|
||||
AMILORIDE TREATS CYSTIC FIBROSIS,cystic fibrosis
|
||||
nail-patella syndrome is a autosomal dominant disease,nail-patella syndrome
|
||||
"TIBIAL MUSCULAR DYSTROPHY, TARDIVE associates Gene TTN",tibial muscular dystrophy
|
||||
Chediak-Higashi Syndrome associates Gene LYST,Chediak-Higashi syndrome
|
||||
juvenile myoclonic epilepsy is not associated with gene efhc1,juvenile myoclonic epilepsy
|
||||
UNVERRICHT-LUNDBORG SYNDROME IS A MOVEMENT DISORDER,Unverricht-Lundborg syndrome
|
||||
immune checkpoint inhibitors treats melanoma,melanoma
|
||||
hyperkalemic periodic paralysis is not associated with gene scn4a,hyperkalemic periodic paralysis
|
||||
Disease ontology identifier for Doyne honeycomb retinal dystrophy is DOID:0081055,Doyne honeycomb retinal dystrophy
|
||||
"Charcot-Marie-Tooth disease, Type 4B1 associates Gene MTMR2",Charcot-Marie-Tooth disease
|
||||
disease ontology identifier for fragile x syndrome is doid:14261,fragile X syndrome
|
||||
Nijmegen Breakage Syndrome is not associated with Gene NBN,Nijmegen breakage syndrome
|
||||
MERCAPTOPURINE TREATS CROHN'S DISEASE,Crohn's disease
|
||||
Alexander disease is a leukodystrophy,Alexander disease
|
||||
disease ontology identifier for werner syndrome is doid:0050466,Werner syndrome
|
||||
spinocerebellar ataxia type 1 is not associated with gene atxn1,spinocerebellar ataxia type 1
|
||||
ACRODERMATITIS ENTEROPATHICA IS NOT A INBORN METAL METABOLISM DISORDER,acrodermatitis enteropathica
|
||||
Adrenoleukodystrophy associates Gene ABCD1,adrenoleukodystrophy
|
||||
"Xeroderma pigmentosum, group A associates Gene XPA",xeroderma pigmentosum
|
||||
Wolcott-Rallison syndrome associates Gene EIF2AK3,Wolcott-Rallison syndrome
|
||||
Mucopolysaccharidosis VI associates Gene ARSB,mucopolysaccharidosis VI
|
||||
very long chain acyl-coa dehydrogenase deficiency is not a disorder of fatty acid oxidation and ketogenesis,very long chain acyl-CoA dehydrogenase deficiency
|
||||
OBESITY IS NOT ASSOCIATED WITH GENE PPARG,obesity
|
||||
angelman syndrome is not associated with gene ube3a,Angelman syndrome
|
||||
pelizaeus-merzbacher disease associates gene plp1,Pelizaeus-Merzbacher disease
|
||||
Ellis-van Creveld syndrome is not a heart disorder,Ellis-Van Creveld syndrome
|
||||
kleefstra syndrome 1 is not associated with gene ehmt1,Kleefstra syndrome 1
|
||||
COLCHICINE TREATS FAMILIAL MEDITERRANEAN FEVER,familial Mediterranean fever
|
||||
"CHARCOT-MARIE-TOOTH DISEASE, TYPE 4J associates Gene FIG4",Charcot-Marie-Tooth disease
|
||||
sitosterolemia is not associated with gene abcg8,sitosterolemia
|
||||
Holt-Oram syndrome is not associated with Gene TBX5,Holt-Oram syndrome
|
||||
OBESITY IS NOT ASSOCIATED WITH GENE MC4R,obesity
|
||||
Disease ontology identifier for Lafora disease is DOID:3534,Lafora disease
|
||||
Achondroplasia is not associated with Gene FGFR3,achondroplasia
|
||||
BETA-MANNOSIDOSIS IS NOT ASSOCIATED WITH GENE MANBA,beta-mannosidosis
|
||||
BORJESON-FORSSMAN-LEHMANN SYNDROME IS NOT A X-LINKED SYNDROMIC INTELLECTUAL DISABILITY,Borjeson-Forssman-Lehmann syndrome
|
||||
Disease ontology identifier for spinocerebellar ataxia type 1 is DOID:0050954,spinocerebellar ataxia type 1
|
||||
Disease ontology identifier for Tangier disease is DOID:1388,Tangier disease
|
||||
sarcoma is a cancer,sarcoma
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR JUVENILE MYOCLONIC EPILEPSY IS DOID:4890,juvenile myoclonic epilepsy
|
||||
Disease ontology identifier for aniridia is DOID:12704,aniridia
|
||||
CHARGE Syndrome is not associated with Gene CHD7,CHARGE syndrome
|
||||
Creutzfeldt-Jakob disease is not associated with Gene PRNP,Creutzfeldt-Jakob disease
|
||||
central diabetes insipidus associates gene avp,central diabetes insipidus
|
||||
Chediak-Higashi syndrome is a congenital nervous system disorder,Chediak-Higashi syndrome
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR PELIZAEUS-MERZBACHER DISEASE IS DOID:5688,Pelizaeus-Merzbacher disease
|
||||
Borjeson-Forssman-Lehmann syndrome is not associated with Gene PHF6,Borjeson-Forssman-Lehmann syndrome
|
||||
Juvenile polyposis syndrome associates Gene BMPR1A,juvenile polyposis syndrome
|
||||
enoxaparin treats obesity,obesity
|
||||
Ataxia Telangiectasia associates Gene ATM,ataxia telangiectasia
|
||||
Mosaic variegated aneuploidy syndrome 1 associates Gene BUB1B,mosaic variegated aneuploidy syndrome 1
|
||||
bernard-soulier syndrome associates gene gp1ba,Bernard-Soulier syndrome
|
||||
PSEUDOXANTHOMA ELASTICUM ASSOCIATES GENE ABCC6,pseudoxanthoma elasticum
|
||||
ALSTROM SYNDROME ASSOCIATES GENE ALMS1,Alstrom syndrome
|
||||
osteosarcoma is a sarcoma,osteosarcoma
|
||||
"XERODERMA PIGMENTOSUM, COMPLEMENTATION GROUP D ASSOCIATES GENE ERCC2",xeroderma pigmentosum
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR PSEUDOPSEUDOHYPOPARATHYROIDISM IS DOID:4183,pseudopseudohypoparathyroidism
|
||||
Hajdu-Cheney Syndrome associates Gene NOTCH2,Hajdu-Cheney syndrome
|
||||
X-linked agammaglobulinemia associates Gene BTK,X-linked agammaglobulinemia
|
||||
prothrombin complex concentrates treats hemophilia b,hemophilia B
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR ADRENOLEUKODYSTROPHY IS DOID:0060844,adrenoleukodystrophy
|
||||
nitisinone treats alkaptonuria,alkaptonuria
|
||||
l-2-hydroxyglutaric aciduria is not a 2-hydroxyglutaric aciduria,L-2-hydroxyglutaric aciduria
|
||||
alkaptonuria is not a disorder of tyrosine metabolism,alkaptonuria
|
||||
Sarcoma associates Gene TP53,sarcoma
|
||||
ACRODERMATITIS ENTEROPATHICA ASSOCIATES GENE SLC39A4,acrodermatitis enteropathica
|
||||
Disease ontology identifier for spinocerebellar ataxia type 5 is DOID:0050882,spinocerebellar ataxia type 5
|
||||
canavan disease associates gene aspa,Canavan disease
|
||||
disease ontology identifier for sitosterolemia is doid:0090019,sitosterolemia
|
||||
Liver carcinoma associates Gene TP53,liver carcinoma
|
||||
Variant rs2476601 associates Rheumatoid Arthritis,rheumatoid arthritis
|
||||
Osteosarcoma is not associated with Gene TP53,osteosarcoma
|
||||
EPISODIC ATAXIA TYPE 2 (DISORDER) IS NOT ASSOCIATED WITH GENE CACNA1A,episodic ataxia type 2
|
||||
lesch-nyhan syndrome associates gene hprt1,Lesch-Nyhan syndrome
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR WISKOTT-ALDRICH SYNDROME IS DOID:9169,Wiskott-Aldrich syndrome
|
||||
Alstrom syndrome is a ciliopathy,Alstrom syndrome
|
||||
Carney Complex is not associated with Gene PRKAR1A,Carney complex
|
||||
DISEASE ONTOLOGY IDENTIFIER FOR CONGENITAL GENERALIZED LIPODYSTROPHY TYPE 2 IS DOID:10588,congenital generalized lipodystrophy type 2
|
||||
brachydactyly type c is a brachydactyly,brachydactyly type C
|
||||
noonan syndrome is a multiple congenital anomalies/dysmorphic syndrome-variable intellectual disability syndrome,Noonan syndrome
|
||||
Disease ontology identifier for Nance-Horan syndrome is DOID:0050771,Nance-Horan syndrome
|
||||
Carney complex is a autosomal dominant disease,Carney complex
|
||||
immune checkpoint inhibitor treats melanoma,melanoma
|
||||
antineoplastic agents treats osteosarcoma,osteosarcoma
|
||||
timothy syndrome associates gene cacna1c,Timothy syndrome
|
||||
piebaldism is a autosomal dominant disease,piebaldism
|
||||
Disease ontology identifier for Loeys-Dietz syndrome is DOID:0060745,Loeys-Dietz syndrome
|
||||
NOONAN SYNDROME ASSOCIATES GENE PTPN11,Noonan syndrome
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -26,12 +26,13 @@ SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
|
||||
|
||||
CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
|
||||
|
||||
save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_two_hop_mcq_from_monarch_and_robokop_response.csv"
|
||||
save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
|
||||
|
||||
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
def main():
|
||||
start_time = time.time()
|
||||
@@ -39,7 +40,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question
|
||||
output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
|
||||
answer_list.append((row["text"], row["correct_node"], output))
|
||||
|
||||
@@ -31,6 +31,7 @@ CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
|
||||
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
def main():
|
||||
start_time = time.time()
|
||||
@@ -41,7 +42,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
|
||||
output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
|
||||
if not output:
|
||||
|
||||
@@ -24,12 +24,13 @@ CONTEXT_VOLUME = 100
|
||||
|
||||
CHAT_DEPLOYMENT_ID = CHAT_MODEL_ID
|
||||
|
||||
save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_one_hop_true_false_binary_response.csv"
|
||||
save_name = "_".join(CHAT_MODEL_ID.split("-"))+"_kg_rag_based_true_false_binary_response.csv"
|
||||
|
||||
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
def main():
|
||||
start_time = time.time()
|
||||
@@ -37,7 +38,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(row["text"], vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
enriched_prompt = "Context: "+ context + "\n" + "Question: "+ question
|
||||
output = get_GPT_response(enriched_prompt, SYSTEM_PROMPT, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=TEMPERATURE)
|
||||
answer_list.append((row["text"], row["label"], output))
|
||||
|
||||
@@ -31,6 +31,7 @@ SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
def main():
|
||||
start_time = time.time()
|
||||
@@ -40,7 +41,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, context_volume, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
enriched_prompt = "Context: "+ context + "\n" + "Question: " + question
|
||||
output = get_GPT_response(enriched_prompt, system_prompt, CHAT_MODEL_ID, CHAT_DEPLOYMENT_ID, temperature=temperature)
|
||||
if not output:
|
||||
|
||||
@@ -22,7 +22,7 @@ MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
|
||||
BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
|
||||
CACHE_DIR = config_data["LLM_CACHE_DIR"]
|
||||
|
||||
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_two_hop_mcq_from_monarch_and_robokop_response.csv"
|
||||
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
|
||||
|
||||
|
||||
INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
|
||||
@@ -30,6 +30,7 @@ INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
|
||||
|
||||
@@ -43,7 +44,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
output = llm_chain.run(context=context, question=question)
|
||||
answer_list.append((row["text"], row["correct_node"], output))
|
||||
answer_df = pd.DataFrame(answer_list, columns=["question", "correct_answer", "llm_answer"])
|
||||
|
||||
61
kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py
Normal file
61
kg_rag/rag_based_generation/Llama/run_mcq_qa_medgpt.py
Normal file
@@ -0,0 +1,61 @@
|
||||
'''
|
||||
This script takes the MCQ-style questions from the csv file and saves the results as another csv file.
|
||||
This script makes use of the BioMedGPT-LM-7B model (PharMolix/BioMedGPT-LM-7B), loaded through the Llama model loader.
|
||||
Before running this script, make sure to configure the file paths in the config.yaml file.
|
||||
'''
|
||||
|
||||
from langchain import PromptTemplate, LLMChain
|
||||
from kg_rag.utility import *
|
||||
|
||||
|
||||
QUESTION_PATH = config_data["MCQ_PATH"]
|
||||
SYSTEM_PROMPT = system_prompts["MCQ_QUESTION"]
|
||||
CONTEXT_VOLUME = int(config_data["CONTEXT_VOLUME"])
|
||||
QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data["QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD"])
|
||||
QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data["QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY"])
|
||||
VECTOR_DB_PATH = config_data["VECTOR_DB_PATH"]
|
||||
NODE_CONTEXT_PATH = config_data["NODE_CONTEXT_PATH"]
|
||||
SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL"]
|
||||
SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data["SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL"]
|
||||
SAVE_PATH = config_data["SAVE_RESULTS_PATH"]
|
||||
MODEL_NAME = 'PharMolix/BioMedGPT-LM-7B'
|
||||
BRANCH_NAME = 'main'
|
||||
CACHE_DIR = config_data["LLM_CACHE_DIR"]
|
||||
|
||||
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_mcq_from_monarch_and_robokop_response.csv"
|
||||
|
||||
|
||||
INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
|
||||
|
||||
vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)
|
||||
embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)
|
||||
node_context_df = pd.read_csv(NODE_CONTEXT_PATH)
|
||||
edge_evidence = False
|
||||
|
||||
|
||||
def main():
    """Answer a fixed sample of MCQ questions with retrieved context and save them.

    Steps, in order:
      1. Load the LLM with ``llama_model`` and wrap it in an ``LLMChain`` whose
         prompt is built from ``INSTRUCTION`` and ``SYSTEM_PROMPT``.
      2. Read the questions from ``QUESTION_PATH`` and draw 50 rows with
         ``random_state=40`` so the same subset is drawn on every run.
      3. For each sampled row, retrieve context for the question text and run
         the chain on (context, question).
      4. Write question / correct answer / LLM answer rows to
         ``SAVE_PATH``/``save_name`` as CSV and print the elapsed minutes.
    """
    started_at = time.time()

    # Model first, then the prompt, then the chain that ties them together.
    model = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR)
    prompt_template = PromptTemplate(
        template=get_prompt(INSTRUCTION, SYSTEM_PROMPT),
        input_variables=["context", "question"],
    )
    chain = LLMChain(prompt=prompt_template, llm=model)

    sampled_questions = pd.read_csv(QUESTION_PATH).sample(50, random_state=40)

    records = []
    for _, record in sampled_questions.iterrows():
        question = record["text"]
        retrieved_context = retrieve_context(
            question,
            vectorstore,
            embedding_function_for_context_retrieval,
            node_context_df,
            CONTEXT_VOLUME,
            QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD,
            QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY,
            edge_evidence,
        )
        llm_answer = chain.run(context=retrieved_context, question=question)
        records.append((question, record["correct_node"], llm_answer))

    results = pd.DataFrame(records, columns=["question", "correct_answer", "llm_answer"])
    output_file = os.path.join(SAVE_PATH, save_name)
    results.to_csv(output_file, index=False, header=True)

    elapsed_minutes = (time.time() - started_at) / 60
    print("Completed in {} min".format(elapsed_minutes))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
@@ -22,8 +22,9 @@ MODEL_NAME = config_data["LLAMA_MODEL_NAME"]
|
||||
BRANCH_NAME = config_data["LLAMA_MODEL_BRANCH"]
|
||||
CACHE_DIR = config_data["LLM_CACHE_DIR"]
|
||||
CONTEXT_VOLUME = 100
|
||||
edge_evidence = False
|
||||
|
||||
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_one_hop_true_false_binary_response.csv"
|
||||
save_name = "_".join(MODEL_NAME.split("/")[-1].split("-"))+"_kg_rag_based_true_false_binary_response.csv"
|
||||
|
||||
|
||||
INSTRUCTION = "Context:\n\n{context} \n\nQuestion: {question}"
|
||||
@@ -43,7 +44,7 @@ def main():
|
||||
answer_list = []
|
||||
for index, row in question_df.iterrows():
|
||||
question = row["text"]
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY)
|
||||
context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)
|
||||
output = llm_chain.run(context=context, question=question)
|
||||
answer_list.append((row["text"], row["label"], output))
|
||||
answer_df = pd.DataFrame(answer_list, columns=["question", "label", "llm_answer"])
|
||||
|
||||
153
notebooks/disease_retrieval_accuracy.ipynb
Normal file
153
notebooks/disease_retrieval_accuracy.ipynb
Normal file
@@ -0,0 +1,153 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "945c420e-bb44-4ffb-b899-e049caf0d918",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.chdir('..')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f2bdefb3-3e59-409a-81b4-2e9ffbdfdb1a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"from kg_rag.utility import *\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "19fc98b9-64a8-40c0-9e5a-92b4392e6969",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = pd.read_csv('data/dataset_for_entity_retrieval_accuracy_analysis.csv')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "2851be4c-2a76-4f6d-b5f4-118e8122b155",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"VECTOR_DB_PATH = config_data[\"VECTOR_DB_PATH\"]\n",
|
||||
"SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data[\"SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL\"]\n",
|
||||
"\n",
|
||||
"vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "7255fbab-d8b4-43a3-b870-9d67ad79d061",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"322it [00:05, 56.20it/s]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 4.74 s, sys: 896 ms, total: 5.64 s\n",
|
||||
"Wall time: 5.73 s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"\n",
|
||||
"correct_retrieval = 0\n",
|
||||
"\n",
|
||||
"for index, row in tqdm(data.iterrows()):\n",
|
||||
" question = row['text']\n",
|
||||
" entities = disease_entity_extractor_v2(question) \n",
|
||||
" for entity in entities:\n",
|
||||
" node_search_result = vectorstore.similarity_search_with_score(entity, k=1)\n",
|
||||
" if node_search_result[0][0].page_content == row['node_hits']:\n",
|
||||
" correct_retrieval += 1 \n",
|
||||
" break\n",
|
||||
" \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "2f997335-bff7-431c-bbd8-608513eddcc7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Retrieval accuracy is 99.7%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"retrieval_accuracy = 100*correct_retrieval/data.shape[0]\n",
|
||||
"print(f'Retrieval accuracy is {round(retrieval_accuracy,1)}%')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "afe971ab-b8b9-4c88-9657-c588813b412f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
@@ -2,28 +2,31 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 1,
|
||||
"id": "3d3dca32-b77f-471d-b834-20ac795f9f17",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.chdir('..')"
|
||||
"os.chdir('..')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 2,
|
||||
"id": "9da344d2-8e45-4574-aa19-4ad76c566101",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from kg_rag.utility import *\n"
|
||||
"from IPython.display import clear_output\n",
|
||||
"from kg_rag.utility import *\n",
|
||||
"\n",
|
||||
"clear_output()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"execution_count": 3,
|
||||
"id": "b44bf274-41d1-4153-a65e-bfb9b90ebcc6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -57,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"execution_count": 4,
|
||||
"id": "33c0771d-e6be-406b-9b17-51f6377bcb6a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -77,7 +80,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"id": "bbbdb428-6d01-43f2-9e58-b919e7a68736",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -231,18 +234,18 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 113,
|
||||
"execution_count": 13,
|
||||
"id": "2a1c9337-fd39-45b0-b12a-6de9b5971b9e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"question = 'Which drugs are contraindicated in obsessive compulsive disorder?'\n"
|
||||
"question = 'Does drug dependence have any genetic factors? Do you have any statistical evidence from trustworthy sources for this?'\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 114,
|
||||
"execution_count": 14,
|
||||
"id": "e6852cb3-8bf9-408b-ab65-492b75c690ed",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -250,7 +253,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The drugs that are contraindicated in obsessive-compulsive disorder include Orphenadrine, PHENYLPROPANOLAMINE POLISTIREX, CHLORPHENIRAMINE POLISTIREX, CODEINE POLISTIREX, Ibuprofen Lysine, Benzoic Acid, (R)-3-(1-Hydroxy-2-(methylamino)ethyl)phenol 2,3-dihydroxysuccinate, Dexibuprofen, Phenacetin, Pheniramine Maleate, Pyrilamine, Butalbital, Propoxyphene, Ibuprofen, Orphenadrine Citrate, Phenobarbital, Caffeine, Caffeine Citrate, 2-Amino-1-phenyl-1-propanol, Thiamine, Dihydrocodeine tartrate, Chlorpheniramine, (1S,2R)-1-benzyl-3-(dimethylamino)-2-methyl-1-phenylpropyl propanoate naphtalene-2-sulfonic acid, Phenylephrine, Ergotamine, Codeine, Riboflavin, and Pheniramine. The provenance of this information is DrugCentral.\n",
|
||||
"Yes, drug dependence does have genetic factors. This is evidenced by the association of drug dependence with genes KAT2B and SLC25A16. The statistical evidence comes from Genome-Wide Association Studies (GWAS), with p-values of 4e-10 and 1e-09 respectively, indicating a statistically significant association.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@@ -264,7 +267,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 115,
|
||||
"execution_count": 15,
|
||||
"id": "b8079bc6-d309-4c88-9440-376aa43d972e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -272,7 +275,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are no specific drugs that are universally contraindicated in obsessive-compulsive disorder (OCD). However, certain medications like benzodiazepines and atypical antipsychotics may potentially worsen OCD symptoms. The choice of medication always depends on the individual's overall health, the severity of their symptoms, and their response to treatment.\n",
|
||||
"Yes, drug dependence does have genetic factors. According to the National Institute on Drug Abuse, genetics account for about 40-60% of a person's vulnerability to drug addiction.\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
@@ -286,7 +289,7 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d44098cc-ca4c-4ffa-a5ca-e273a029cb67",
|
||||
"id": "16e1ae6d-d0e0-4b42-a8fe-517033fb9960",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
|
||||
2
pid_info.sh
Normal file
2
pid_info.sh
Normal file
@@ -0,0 +1,2 @@
|
||||
# Report PID, start time, command name and elapsed time for the two tracked
# processes. Selecting by PID with -p replaces grepping the whole process
# table for the digits, which also matches unrelated rows (for example a
# longer PID that merely contains the same digits). Note that ps prints its
# column header with each call.
ps -o pid,lstart,comm,etime -p 20342
ps -o pid,lstart,comm,etime -p 9628
|
||||
@@ -30,30 +30,30 @@ KG_RAG_BASED_TEXT_GENERATION: |
|
||||
TRUE_FALSE_QUESTION: |
|
||||
You are an expert biomedical researcher. For answering the Question at the end, you need to first read the Context provided.
|
||||
Based on that Context, provide your answer in the following JSON format:
|
||||
{
|
||||
{{
|
||||
"answer": "True"
|
||||
}
|
||||
}}
|
||||
OR
|
||||
{
|
||||
{{
|
||||
"answer": "False"
|
||||
}
|
||||
}}
|
||||
TRUE_FALSE_QUESTION_PROMPT_BASED: |
|
||||
You are an expert biomedical researcher. Please provide your answer in the following JSON format for the Question asked:
|
||||
{
|
||||
{{
|
||||
"answer": "True"
|
||||
}
|
||||
}}
|
||||
OR
|
||||
{
|
||||
{{
|
||||
"answer": "False"
|
||||
}
|
||||
}}
|
||||
|
||||
# MCQ Question
|
||||
MCQ_QUESTION: |
|
||||
You are an expert biomedical researcher. For answering the Question at the end, you need to first read the Context provided.
|
||||
Based on that Context, provide your answer in the following JSON format for the Question asked.
|
||||
{
|
||||
{{
|
||||
"answer": <correct answer>
|
||||
}
|
||||
}}
|
||||
MCQ_QUESTION_PROMPT_BASED: |
|
||||
You are an expert biomedical researcher. Please provide your answer in the following JSON format for the Question asked:
|
||||
{{
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:24<00:00, 68.06s/it]\n",
|
||||
"Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00, 6.56s/it]\n",
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
|
||||
" warnings.warn(\n",
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
|
||||
@@ -53,8 +53,8 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 25.6 s, sys: 51.1 s, total: 1min 16s\n",
|
||||
"Wall time: 3min 31s\n"
|
||||
"CPU times: user 18.9 s, sys: 26.3 s, total: 45.3 s\n",
|
||||
"Wall time: 26 s\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -74,7 +74,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 7,
|
||||
"id": "0a28f1ce-5cc5-4a17-84d7-b0dda29815a5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@@ -84,12 +84,12 @@
|
||||
"Question:alpha-Mannosidosis associates Gene MAN2B1\n",
|
||||
"Answer:\n",
|
||||
"'''\n",
|
||||
"text = ['Is it PNPLA3 or HLA-B that has a significant association with the disease liver benign neoplasm?']\n"
|
||||
"text = [\"Out of the given list, which Gene is associated with psoriasis and Takayasu's arteritis. Given list is: SHTN1, HLA-B, SLC14A2, BTBD9, DTNB\"]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"id": "f59eeb37-57dd-42ae-b9ff-f8442eb613a7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -97,8 +97,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"A meta-analysis of 11,000 patients with hepatocellular carcinoma.\n",
|
||||
"Hepatocellular carcinoma (HCC) is the most common primary liver cancer and the third leading cause of cancer-related deaths worldwide. D The incidence of\n"
|
||||
"P1, TNFAIP3, TNIP1, TNIP3, TNFAIP2, TNFAIP6, TNFAIP7, TNFAIP8, TNFAIP9, TNFAIP10\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
||||
195
test_notebooks/medgpt_rag_test.ipynb
Normal file
195
test_notebooks/medgpt_rag_test.ipynb
Normal file
@@ -0,0 +1,195 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b33a915d-cc1d-4102-a2ee-159c02e6c579",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"os.chdir('..')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "57c0a1b8-e339-4f6b-941e-7af7b902de7c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||||
" from .autonotebook import tqdm as notebook_tqdm\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain import PromptTemplate, LLMChain\n",
|
||||
"from kg_rag.utility import *\n",
|
||||
"from tqdm import tqdm\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "2672548d-7d25-4f3c-94d1-d19206049076",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"QUESTION_PATH = config_data[\"MCQ_PATH\"]\n",
|
||||
"SYSTEM_PROMPT = system_prompts[\"MCQ_QUESTION\"]\n",
|
||||
"CONTEXT_VOLUME = int(config_data[\"CONTEXT_VOLUME\"])\n",
|
||||
"QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD = float(config_data[\"QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD\"])\n",
|
||||
"QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY = float(config_data[\"QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY\"])\n",
|
||||
"VECTOR_DB_PATH = config_data[\"VECTOR_DB_PATH\"]\n",
|
||||
"NODE_CONTEXT_PATH = config_data[\"NODE_CONTEXT_PATH\"]\n",
|
||||
"SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL = config_data[\"SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL\"]\n",
|
||||
"SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL = config_data[\"SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL\"]\n",
|
||||
"SAVE_PATH = config_data[\"SAVE_RESULTS_PATH\"]\n",
|
||||
"\n",
|
||||
"MODEL_NAME = 'PharMolix/BioMedGPT-LM-7B'\n",
|
||||
"BRANCH_NAME = 'main'\n",
|
||||
"CACHE_DIR = config_data[\"LLM_CACHE_DIR\"]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "c753b053-be44-4ddb-8d55-3bf434428954",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"INSTRUCTION = \"Context:\\n\\n{context} \\n\\nQuestion: {question}\"\n",
|
||||
"\n",
|
||||
"vectorstore = load_chroma(VECTOR_DB_PATH, SENTENCE_EMBEDDING_MODEL_FOR_NODE_RETRIEVAL)\n",
|
||||
"embedding_function_for_context_retrieval = load_sentence_transformer(SENTENCE_EMBEDDING_MODEL_FOR_CONTEXT_RETRIEVAL)\n",
|
||||
"node_context_df = pd.read_csv(NODE_CONTEXT_PATH)\n",
|
||||
"edge_evidence = False\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f18c9efb-556c-4b37-8b00-e06a73a19f86",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:19<00:00, 6.66s/it]\n",
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
|
||||
" warnings.warn(\n",
|
||||
"/root/anaconda3/envs/kg_rag_test_2/lib/python3.10/site-packages/transformers/generation/configuration_utils.py:367: UserWarning: `do_sample` is set to `False`. However, `top_p` is set to `0.6` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"llm = llama_model(MODEL_NAME, BRANCH_NAME, CACHE_DIR) \n",
|
||||
"template = get_prompt(INSTRUCTION, SYSTEM_PROMPT)\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"context\", \"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0370d703-4e18-4c78-9e9a-2030b498253e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm) \n",
|
||||
"question_df = pd.read_csv(QUESTION_PATH) \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "275f4171-3be7-46ca-bf16-18160ce72f3b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\"Out of the given list, which Gene is associated with psoriasis and Takayasu's arteritis. Given list is: SHTN1, HLA-B, SLC14A2, BTBD9, DTNB\""
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"question_df.iloc[0].text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "cc5a65fb-6bd3-4948-84e5-f404af83d3f7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (4135 > 2048). Running this sequence through the model will result in indexing errors\n",
|
||||
"This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.\n",
|
||||
"0it [04:19, ?it/s]\n",
|
||||
"\n",
|
||||
"KeyboardInterrupt\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"\n",
|
||||
"answer_list = []\n",
|
||||
"question_df = question_df.sample(50, random_state=40)\n",
|
||||
"for index, row in tqdm(question_df.iterrows()):\n",
|
||||
" question = row[\"text\"]\n",
|
||||
" context = retrieve_context(question, vectorstore, embedding_function_for_context_retrieval, node_context_df, CONTEXT_VOLUME, QUESTION_VS_CONTEXT_SIMILARITY_PERCENTILE_THRESHOLD, QUESTION_VS_CONTEXT_MINIMUM_SIMILARITY, edge_evidence)\n",
|
||||
" output = llm_chain.run(context=context, question=question)\n",
|
||||
" print(output)\n",
|
||||
" input('press enter')\n",
|
||||
" answer_list.append((row[\"text\"], row[\"correct_node\"], output))\n",
|
||||
"answer_df = pd.DataFrame(answer_list, columns=[\"question\", \"correct_answer\", \"llm_answer\"])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "94eb325d-17d4-4013-907d-7a38dabaea56",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"answer_df.to_csv(os.path.join(SAVE_PATH, save_name), index=False, header=True) \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user