reorganize classification by calling external LLMs

raphaelzhou1
2023-10-08 05:32:28 -04:00
parent 6acd4a495c
commit 9bf8a5ffe4
4 changed files with 42 additions and 41 deletions


@@ -1,4 +1,4 @@
sentence,snippets,target,sentiment_score,aspects,format,label
sentence,snippets,target,sentiment_score,aspects,format,label, contextualized_sentence
$HCP Come to the party and buy this -gonna give solid gains and a dividend $$$$$$,['gonna give solid gains and a dividend'],HCP,0.52,['Corporate/Dividend Policy'],post,0
@gakrum nice chart shows distinctive down channel not a dip.. where do you see the bottom? $SPY ..$150? ..$130?,['chart shows distinctive down channel'],SPY,-0.443,['Stock/Technical Analysis'],post,2
Japan's Asahi to submit bid next week for SABMiller's Grolsch and Peroni - Yomiuri,['to submit bid next week'],SABMiller,0.236,['Stock/Buyside/Stock Buyside'],headline,0
Note: this file no longer parses cleanly as CSV, because the new header row declares eight fields while the existing data rows still contain seven.
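Until the data rows are regenerated with the new column, the file can still be loaded defensively; a minimal sketch (the file name is hypothetical), where skipinitialspace also strips the stray space after "label," in the new header:

import pandas as pd

# Short rows are padded with NaN for the new trailing column;
# skipinitialspace=True strips the space in the header field name.
df = pd.read_csv("news_dataset.csv", skipinitialspace=True)  # hypothetical file name
df["contextualized_sentence"] = df["contextualized_sentence"].fillna("")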


@@ -0,0 +1,36 @@
import os

import requests

# Classification methods:
def extract_classification(text, classification_prompt):
    print("Extracting classification for", text)
    api_key = os.getenv('OPENAI_API_KEY')
    api_url = "https://api.openai.com/v1/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    payload = {
        'model': 'gpt-3.5-turbo',
        "messages": [
            {
                "role": "system",
                "content": "You are a financial analyst."
            },
            {
                "role": "user",
                "content": text + classification_prompt,
            }
        ],
    }
    print("Sending request to", api_url, "with payload", payload)
    try:
        response = requests.post(api_url, headers=headers, json=payload)
        json_data = response.json()
        print("json data", json_data)
        # Chat completions nest the text under choices -> message -> content,
        # not in a top-level list of completions.
        classification_response = json_data['choices'][0]['message']['content'].strip()
        print("Classification response:", classification_response)
        return classification_response
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        return None


@@ -21,6 +21,7 @@ from searchtweets import load_credentials
# From src/
import requests_url
from requests_url import requests_get
from external_LLMs import external_LLMs
from scrapers.yahoo import scrape_yahoo
from sentence_processing.split_sentence import split_sentence
from scrapers.cnbc import scrape_cnbc
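The `from external_LLMs import external_LLMs` form implies the helper lives in a module inside a package of the same name; a plausible layout, which is an assumption since the file tree is not shown in this diff:

# Assumed layout under src/ (hypothetical; only the import line is in the commit):
#   external_LLMs/
#       __init__.py
#       external_LLMs.py   # the new file above, defining extract_classification
from external_LLMs import external_LLMs

label = external_LLMs.extract_classification("sample sentence", "sample prompt")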
@@ -43,45 +44,6 @@ twitter_bearer_token = os.getenv("TWITTER_BEARER_TOKEN")
# auth = tweepy.OAuth1UserHandler(twitter_api_key, twitter_api_key_secret, twitter_access_token, twitter_access_token_secret)
# api = tweepy.API(auth)
# Classification methods:
def extract_classification(text, classification_prompt):
    print("Extracting classification for", text)
    api_key = os.getenv('OPENAI_API_KEY')
    api_url = "https://api.openai.com/v1/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
    }
    payload = {
        'model': 'gpt-3.5-turbo',
        "messages": [
            {
                "role": "system",
                "content": "You are a financial analyst."
            },
            {
                "role": "user",
                "content": text + classification_prompt,
            }
        ],
    }
    print("Sending request to", api_url, "with payload", payload)
    try:
        response = requests.post(api_url, headers=headers, json=payload)
        json_data = response.json()
        print("json data", json_data)
        classification_response = json_data[0]['text'].strip()
        print("Classification response:", classification_response)
        return classification_response
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
# Scraping methods:
def url_encode_string(input_string):
    encoded_string = urllib.parse.quote(input_string)
@@ -702,7 +664,7 @@ def select_column_and_classify():
    for row_index, row in df.iloc[1:].iterrows():
        target_sentence = row[sentence_column]
        classification_response = extract_classification(target_sentence, classification_prompt)
        classification_response = external_LLMs.extract_classification(target_sentence, classification_prompt)
        df.at[row_index, "classification"] = classification_response  # Assign classification response to the new column
    output_file_path = os.path.splitext(file_path)[0] + "_classified.csv"
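One caveat with the new call site: extract_classification returns None when the request fails, so the DataFrame cell may want a string fallback; a minimal sketch (the empty-string default is an assumption):

classification_response = external_LLMs.extract_classification(target_sentence, classification_prompt)
# The helper returns None on a request error; default to "" so the
# "_classified.csv" output keeps a consistent string column.
df.at[row_index, "classification"] = classification_response or ""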
@@ -719,13 +681,16 @@ def select_column_and_classify():
    context_choice = gui.ynbox("Do you want to research the context for this news?", "Context Research")
    process_existing_file = gui.ynbox("Do you want to process an existing file?", "Context Research")
    if context_choice:
        print("cp 1")  # debug checkpoint
        file_path = gui.fileopenbox("Select the CSV file containing news for context research", filetypes=["*.csv"])
        df = pd.read_csv(file_path)
        column_names = df.columns.tolist()
        print("cp 2")  # debug checkpoint
        if not process_existing_file:
            df["link"] = ""  # Create a new column named "link"
            df["contextualized_sentence"] = ""  # Create a new column named "contextualized_sentence"
        if file_path:
            sentence_column = gui.buttonbox("Column Selection", "Select the column for target sentence in the CSV:",
                                            choices=column_names)