Replace pycorenlp with Stanza to fix Unicode encoding issues.

2021-08-01 22:47:51 +03:00 · 2020-12-30 12:17:22 +01:00
parent ce8ea7055a
commit 071f38b243
5 changed files with 18 additions and 8 deletions
--- a/.idea/encodings.xml
+++ b/.idea/encodings.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
+</project>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>
--- a/Giveme5W1H/extractor/preprocessors/preprocessor_core_nlp.py
+++ b/Giveme5W1H/extractor/preprocessors/preprocessor_core_nlp.py
@@ -1,7 +1,7 @@
 import logging

 import nltk
-from pycorenlp import StanfordCoreNLP
+from stanza.server import CoreNLPClient, StartServer


 class Preprocessor:
@@ -18,10 +18,9 @@ class Preprocessor:
        self.log = logging.getLogger('GiveMe5W')

        # connect to CoreNLP server
-        if host is None:
-            self.cnlp = StanfordCoreNLP("http://localhost:9000")
-        else:
-            self.cnlp = StanfordCoreNLP(host)
+        host = "http://localhost:9000" if host is None else host
+        self.cnlp = CoreNLPClient(endpoint=host,
+                                  start_server = StartServer.DONT_START)

        # define basic base_config and desired processing pipeline
        self.base_config = {
@@ -109,7 +108,8 @@ class Preprocessor:
        :return Document: The processed Document object.
        """
        actual_config = self._build_actual_config(document)
-        annotation = self.cnlp.annotate(document.get_full_text(), actual_config)
+        annotation = self.cnlp.annotate(text=document.get_full_text(),
+                                        properties = actual_config)

        if type(annotation) is str:
            print(annotation)
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ numpy==1.14.3
 pandas==0.22.0
 parsedatetime==2.4
 plotly==2.5.1
-pycorenlp==0.3.0
+stanza>=1.1.1
 spacy==2.0.11
 Twisted==19.7.0
 typing==3.6.4
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ setup(name='giveme5w1h',
          'pandas',
          'parsedatetime',
          'plotly',
-          'pycorenlp',
+          'stanza',
          'spacy',
          'Twisted',
          'typing',