Init commit

MaartenGr
2020-10-22 14:21:37 +02:00
commit 5d408d7db1
20 changed files with 407 additions and 0 deletions

1
.gitattributes vendored Normal file

@@ -0,0 +1 @@
*.ipynb linguist-documentation

77
.gitignore vendored Normal file

@@ -0,0 +1,77 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Sphinx documentation
docs/_build/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.idea
.idea/

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020, Maarten P. Grootendorst

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

19
Makefile Normal file

@@ -0,0 +1,19 @@
test:
	pytest

install:
	python -m pip install -e .

install-test:
	python -m pip install -e ".[test]"
	python -m pip install -e ".[all]"

pypi:
	python setup.py sdist
	python setup.py bdist_wheel --universal
	twine upload dist/*

clean:
	rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache

check: test clean

6
README.md Normal file

@@ -0,0 +1,6 @@
[![PyPI - Python](https://img.shields.io/badge/python-3.6%20|%203.7%20|%203.8-blue.svg)](https://pypi.org/project/keybert/)
[![PyPI - License](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/MaartenGr/keybert/blob/master/LICENSE)
[![PyPI - PyPi](https://img.shields.io/pypi/v/keyBERT)](https://pypi.org/project/keybert/)
[![Build](https://img.shields.io/github/workflow/status/MaartenGr/keyBERT/Code%20Checks/master)](https://pypi.org/project/keybert/)
# KeyBERT

1
docs/algorithm.md Normal file

@@ -0,0 +1 @@
# The Algorithm

3
docs/api/keybert.md Normal file

@@ -0,0 +1,3 @@
# `KeyBERT`
::: keybert.model.KeyBERT

BIN
docs/img/icon.png Normal file

Binary image added (13 KiB)

1
docs/index.md Normal file

@@ -0,0 +1 @@
# KeyBERT

0
docs/style.css Normal file

BIN
images/icon.png Normal file

Binary image added (15 KiB)

BIN
images/logo.png Normal file

Binary image added (18 KiB)

0
keybert/__init__.py Normal file

143
keybert/model.py Normal file

@@ -0,0 +1,143 @@
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from typing import List, Union
import warnings


class KeyBERT:
    def __init__(self, model: str = 'distilbert-base-nli-mean-tokens'):
        self.model = SentenceTransformer(model)
        self.doc_embeddings = None

    def extract_keywords(self,
                         docs: Union[str, List[str]],
                         keyphrase_length: int = 1,
                         stop_words: Union[str, List[str]] = 'english',
                         top_n: int = 5,
                         min_df: int = 1) -> Union[List[str], List[List[str]]]:
        """ Extract keywords/keyphrases

        NOTE: I would advise you to use the single-document option if you have
        limited hardware; see the remarks under Multiple Documents below.

        Single Document:
            Keywords are extracted for one document at a time by comparing
            its embedding with the embeddings of its candidate words.

        Multiple Documents:
            There is an option to extract keywords for multiple documents
            that is faster than extraction for multiple single documents.
            However, this method assumes that you can keep the word embeddings
            for all words in the vocabulary in memory, which might be troublesome.
            I would advise against using this option and simply iterating
            over documents instead if you have limited hardware.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: Minimum document frequency of a word across all documents
                    if keywords for multiple documents need to be extracted

        Returns:
            keywords: The top n keywords for a document, or a list of the
                      top n keywords per document
        """
        if isinstance(docs, str):
            return self._extract_keywords_single_doc(docs,
                                                     keyphrase_length,
                                                     stop_words,
                                                     top_n)
        elif isinstance(docs, list):
            warnings.warn("Although extracting keywords for multiple documents is faster "
                          "than iterating over single documents, it requires significant memory "
                          "to hold all word embeddings. Use this at your own discretion!")
            return self._extract_keywords_multiple_docs(docs,
                                                        keyphrase_length,
                                                        stop_words,
                                                        top_n,
                                                        min_df=min_df)

    def _extract_keywords_single_doc(self,
                                     doc: str,
                                     keyphrase_length: int = 1,
                                     stop_words: Union[str, List[str]] = 'english',
                                     top_n: int = 5) -> List[str]:
        """ Extract keywords/keyphrases for a single document

        Arguments:
            doc: The document for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases

        Returns:
            keywords: The top n keywords for a document
        """
        try:
            # Extract Words
            n_gram_range = (keyphrase_length, keyphrase_length)
            count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
            words = count.get_feature_names()

            # Extract Embeddings
            doc_embeddings = self.model.encode([doc])
            word_embeddings = self.model.encode(words)

            # Calculate distances and extract keywords
            distances = cosine_similarity(doc_embeddings, word_embeddings)
            keywords = [words[index] for index in distances.argsort()[0][-top_n:]]
            return keywords[::-1]
        except ValueError:
            return []

    def _extract_keywords_multiple_docs(self,
                                        docs: List[str],
                                        keyphrase_length: int = 1,
                                        stop_words: str = 'english',
                                        top_n: int = 5,
                                        min_df: int = 1) -> List[List[str]]:
        """ Extract keywords/keyphrases for multiple documents

        Arguments:
            docs: The documents for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the documents
            top_n: Return the top n keywords/keyphrases
            min_df: The minimum document frequency of a word across all documents

        Returns:
            keywords: The top n keywords for each document
        """
        # Extract words
        n_gram_range = (keyphrase_length, keyphrase_length)
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words, min_df=min_df).fit(docs)
        words = count.get_feature_names()
        df = count.transform(docs)

        # Extract embeddings
        word_embeddings = self.model.encode(words, show_progress_bar=True)
        doc_embeddings = self.model.encode(docs, show_progress_bar=True)

        # Extract keywords
        keywords = []
        for index, doc in tqdm(enumerate(docs)):
            doc_words = [words[i] for i in df[index].nonzero()[1]]
            if doc_words:
                doc_word_embeddings = np.array([word_embeddings[i] for i in df[index].nonzero()[1]])
                distances = cosine_similarity([doc_embeddings[index]], doc_word_embeddings)[0]
                doc_keywords = [doc_words[i] for i in distances.argsort()[-top_n:]]
                keywords.append(doc_keywords)
            else:
                keywords.append(["None Found"])
        return keywords
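
A minimal usage sketch of the KeyBERT class above (not part of this commit; the example text and printed output are illustrative, and it assumes the package is installed and the sentence-transformers model weights can be downloaded):

from keybert.model import KeyBERT

doc = ("Supervised learning is the machine learning task of learning a function "
       "that maps an input to an output based on example input-output pairs.")

# Load the default sentence-transformers model and extract single-word keywords
kb = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = kb.extract_keywords(doc, keyphrase_length=1, stop_words='english', top_n=5)
print(keywords)  # e.g. ['learning', 'supervised', ...] -- exact output depends on the model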

42
mkdocs.yml Normal file

@@ -0,0 +1,42 @@
site_name: KeyBERT
extra_css: [style.css]
repo_url: https://github.com/MaartenGr/keyBERT
site_url: https://maartengr.github.io/keyBERT/
site_description: Leveraging BERT to extract important keywords
site_author: Maarten P. Grootendorst
use_directory_urls: false
nav:
  - Home:
      - Index: index.md
  - API:
      - KeyBERT: api/keybert.md
plugins:
  - mkdocstrings:
      watch:
        - keybert
  - search
copyright: Copyright &copy; 2020 Maintained by <a href="https://github.com/MaartenGr">Maarten</a>.
theme:
  custom_dir: images/
  name: material
  icon:
    logo: material/library
  font:
    text: Ubuntu
    code: Ubuntu Mono
  favicon: icon.png
  logo: icon.png
  feature:
    tabs: true
  palette:
    primary: indigo
    accent: blue
markdown_extensions:
  - codehilite
  - pymdownx.inlinehilite
  - pymdownx.details
  - pymdownx.tabbed
  - pymdownx.highlight:
      use_pygments: true
  - toc:
      permalink: true

58
setup.py Normal file

@@ -0,0 +1,58 @@
import setuptools

test_packages = [
    "pytest>=5.4.3",
    "pytest-cov>=2.6.1"
]

base_packages = [
    "sentence-transformers>=0.3.8",
    "scikit-learn>=0.22.2",
    "numpy>=1.18.5",
]

docs_packages = [
    "mkdocs==1.1",
    "mkdocs-material==4.6.3",
    "mkdocstrings==0.8.0",
]

dev_packages = docs_packages + test_packages

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="keybert",
    packages=["keybert"],
    version="0.0.1",
    author="Maarten Grootendorst",
    author_email="maartengrootendorst@gmail.com",
    description="KeyBERT performs keyword extraction with state-of-the-art transformer models.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/MaartenGr/keyBERT",
    keywords="nlp bert keyword extraction embeddings",
    classifiers=[
        "Programming Language :: Python",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Topic :: Scientific/Engineering",
        "Operating System :: Microsoft :: Windows",
        "Operating System :: POSIX",
        "Operating System :: Unix",
        "Operating System :: MacOS",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
    install_requires=base_packages,
    extras_require={
        "test": test_packages,
        "docs": docs_packages,
        "dev": dev_packages,
    },
    python_requires='>=3.6',
)

0
tests/__init__.py Normal file

15
tests/conftest.py Normal file

@@ -0,0 +1,15 @@
# from bertopic import BERTopic
# import pytest
#
#
# @pytest.fixture(scope="module")
# def base_bertopic():
# model = BERTopic(bert_model='distilbert-base-nli-mean-tokens',
# top_n_words=20,
# nr_topics=None,
# n_gram_range=(1, 1),
# min_topic_size=30,
# n_neighbors=15,
# n_components=5,
# verbose=False)
# return model

20
tests/test_model.py Normal file

@@ -0,0 +1,20 @@
# import pytest
# import numpy as np
# import pandas as pd
# from unittest import mock
#
# from sklearn.datasets import fetch_20newsgroups, make_blobs
# from keybert import KeyBERT
#
# newsgroup_docs = fetch_20newsgroups(subset='all')['data'][:1000]
#
# @mock.patch("bertopic.model.BERTopic._extract_embeddings")
# def test_fit_transform(embeddings, base_bertopic):
# """ Test whether predictions are correctly made """
# blobs, _ = make_blobs(n_samples=len(newsgroup_docs), centers=5, n_features=768, random_state=42)
# embeddings.return_value = blobs
# predictions = base_bertopic.fit_transform(newsgroup_docs)
#
# assert isinstance(predictions, list)
# assert len(predictions) == len(newsgroup_docs)
# assert not set(predictions).difference(set(base_bertopic.get_topics().keys()))
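
A test against the extract_keywords API defined in keybert/model.py might look like the following sketch (hypothetical, not part of this commit; it downloads the sentence-transformers model at runtime, so it is slow):

import pytest
from keybert.model import KeyBERT

doc = ("Supervised learning is the machine learning task of learning a function "
       "that maps an input to an output based on example input-output pairs.")


@pytest.fixture(scope="module")
def base_keybert():
    return KeyBERT(model='distilbert-base-nli-mean-tokens')


@pytest.mark.parametrize("keyphrase_length", [1, 2])
def test_extract_keywords_single_doc(base_keybert, keyphrase_length):
    """ Check that single-document extraction returns top_n keyphrases of the requested length """
    keywords = base_keybert.extract_keywords(doc, keyphrase_length=keyphrase_length, top_n=5)
    assert isinstance(keywords, list)
    assert len(keywords) <= 5
    assert all(len(keyword.split()) == keyphrase_length for keyword in keywords)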

0
theme/style.css Normal file