Replace pycorenlp with Stanza to fix Unicode encoding issues.

This commit is contained in:
Lieven Govaerts
2020-12-30 12:17:22 +01:00
parent ce8ea7055a
commit 071f38b243
5 changed files with 18 additions and 8 deletions

4
.idea/encodings.xml generated Normal file
View File

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

View File

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@@ -1,7 +1,7 @@
import logging
import nltk
from pycorenlp import StanfordCoreNLP
from stanza.server import CoreNLPClient, StartServer
class Preprocessor:
@@ -18,10 +18,9 @@ class Preprocessor:
self.log = logging.getLogger('GiveMe5W')
# connect to CoreNLP server
if host is None:
self.cnlp = StanfordCoreNLP("http://localhost:9000")
else:
self.cnlp = StanfordCoreNLP(host)
host = "http://localhost:9000" if host is None else host
self.cnlp = CoreNLPClient(endpoint=host,
start_server = StartServer.DONT_START)
# define basic base_config and desired processing pipeline
self.base_config = {
@@ -109,7 +108,8 @@ class Preprocessor:
:return Document: The processed Document object.
"""
actual_config = self._build_actual_config(document)
annotation = self.cnlp.annotate(document.get_full_text(), actual_config)
annotation = self.cnlp.annotate(text=document.get_full_text(),
properties = actual_config)
if type(annotation) is str:
print(annotation)

View File

@@ -9,7 +9,7 @@ numpy==1.14.3
pandas==0.22.0
parsedatetime==2.4
plotly==2.5.1
pycorenlp==0.3.0
stanza>=1.1.1
spacy==2.0.11
Twisted==19.7.0
typing==3.6.4

View File

@@ -49,7 +49,7 @@ setup(name='giveme5w1h',
'pandas',
'parsedatetime',
'plotly',
'pycorenlp',
'stanza',
'spacy',
'Twisted',
'typing',