Init commit

MaartenGr
2020-10-22 14:21:37 +02:00
commit 5d408d7db1
20 changed files with 407 additions and 0 deletions

1
.gitattributes vendored Normal file

@@ -0,0 +1 @@
*.ipynb linguist-documentation

77
.gitignore vendored Normal file

@@ -0,0 +1,77 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Sphinx documentation
docs/_build/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
.idea
.idea/

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020, Maarten P. Grootendorst

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

19
Makefile Normal file

@@ -0,0 +1,19 @@
test:
	pytest

install:
	python -m pip install -e .

install-test:
	python -m pip install -e ".[test]"
	python -m pip install -e ".[all]"

pypi:
	python setup.py sdist
	python setup.py bdist_wheel --universal
	twine upload dist/*

clean:
	rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache

check: test clean

6
README.md Normal file

@@ -0,0 +1,6 @@
[![PyPI - Python](https://img.shields.io/badge/python-3.6%20|%203.7%20|%203.8-blue.svg)](https://pypi.org/project/keybert/)
[![PyPI - License](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/MaartenGr/keybert/blob/master/LICENSE)
[![PyPI - PyPi](https://img.shields.io/pypi/v/keyBERT)](https://pypi.org/project/keybert/)
[![Build](https://img.shields.io/github/workflow/status/MaartenGr/keyBERT/Code%20Checks/master)](https://pypi.org/project/keybert/)
# KeyBERT

1
docs/algorithm.md Normal file

@@ -0,0 +1 @@
# The Algorithm

3
docs/api/keybert.md Normal file

@@ -0,0 +1,3 @@
# `KeyBERT`
::: keybert.model.KeyBERT

BIN
docs/img/icon.png Normal file

Binary image added (13 KiB)

1
docs/index.md Normal file

@@ -0,0 +1 @@
# KeyBERT

0
docs/style.css Normal file

BIN
images/icon.png Normal file

Binary image added (15 KiB)

BIN
images/logo.png Normal file

Binary image added (18 KiB)

0
keybert/__init__.py Normal file

143
keybert/model.py Normal file

@@ -0,0 +1,143 @@
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
from typing import List, Union
import warnings


class KeyBERT:
    def __init__(self, model: str = 'distilbert-base-nli-mean-tokens'):
        self.model = SentenceTransformer(model)
        self.doc_embeddings = None

    def extract_keywords(self,
                         docs: Union[str, List[str]],
                         keyphrase_length: int = 1,
                         stop_words: Union[str, List[str]] = 'english',
                         top_n: int = 5,
                         min_df: int = 1) -> Union[List[str], List[List[str]]]:
        """ Extract keywords/keyphrases

        NOTE: I would advise you to use the single-document option if you have
        limited hardware; see the remarks under Multiple Documents below.

        Single Document:
            Keywords are extracted for one document at a time by comparing
            its embedding with the embeddings of its candidate words.

        Multiple Documents:
            There is an option to extract keywords for multiple documents
            that is faster than extraction for multiple single documents.
            However, this method assumes that you can keep the word embeddings
            for all words in the vocabulary in memory, which might be troublesome.
            I would advise against using this option and simply iterating
            over documents instead if you have limited hardware.

        Arguments:
            docs: The document(s) for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases
            min_df: Minimum document frequency of a word across all documents
                    if keywords for multiple documents need to be extracted

        Returns:
            keywords: The top n keywords for a document, or a list of the
                      top n keywords per document
        """
        if isinstance(docs, str):
            return self._extract_keywords_single_doc(docs,
                                                     keyphrase_length,
                                                     stop_words,
                                                     top_n)
        elif isinstance(docs, list):
            warnings.warn("Although extracting keywords for multiple documents is faster "
                          "than iterating over single documents, it requires significant memory "
                          "to hold all word embeddings. Use this at your own discretion!")
            return self._extract_keywords_multiple_docs(docs,
                                                        keyphrase_length,
                                                        stop_words,
                                                        top_n,
                                                        min_df=min_df)

    def _extract_keywords_single_doc(self,
                                     doc: str,
                                     keyphrase_length: int = 1,
                                     stop_words: Union[str, List[str]] = 'english',
                                     top_n: int = 5) -> List[str]:
        """ Extract keywords/keyphrases for a single document

        Arguments:
            doc: The document for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the document
            top_n: Return the top n keywords/keyphrases

        Returns:
            keywords: The top n keywords for a document
        """
        try:
            # Extract Words
            n_gram_range = (keyphrase_length, keyphrase_length)
            count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
            words = count.get_feature_names()

            # Extract Embeddings
            doc_embeddings = self.model.encode([doc])
            word_embeddings = self.model.encode(words)

            # Calculate distances and extract keywords
            distances = cosine_similarity(doc_embeddings, word_embeddings)
            keywords = [words[index] for index in distances.argsort()[0][-top_n:]]
            return keywords[::-1]
        except ValueError:
            return []

    def _extract_keywords_multiple_docs(self,
                                        docs: List[str],
                                        keyphrase_length: int = 1,
                                        stop_words: str = 'english',
                                        top_n: int = 5,
                                        min_df: int = 1) -> List[List[str]]:
        """ Extract keywords/keyphrases for multiple documents

        Arguments:
            docs: The documents for which to extract keywords/keyphrases
            keyphrase_length: Length, in words, of the extracted keywords/keyphrases
            stop_words: Stopwords to remove from the documents
            top_n: Return the top n keywords/keyphrases
            min_df: The minimum document frequency of a word across all documents

        Returns:
            keywords: The top n keywords for each document
        """
        # Extract words
        n_gram_range = (keyphrase_length, keyphrase_length)
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words, min_df=min_df).fit(docs)
        words = count.get_feature_names()
        df = count.transform(docs)

        # Extract embeddings
        word_embeddings = self.model.encode(words, show_progress_bar=True)
        doc_embeddings = self.model.encode(docs, show_progress_bar=True)

        # Extract keywords
        keywords = []
        for index, doc in tqdm(enumerate(docs)):
            doc_words = [words[i] for i in df[index].nonzero()[1]]
            if doc_words:
                doc_word_embeddings = np.array([word_embeddings[i] for i in df[index].nonzero()[1]])
                distances = cosine_similarity([doc_embeddings[index]], doc_word_embeddings)[0]
                doc_keywords = [doc_words[i] for i in distances.argsort()[-top_n:]]
                keywords.append(doc_keywords)
            else:
                keywords.append(["None Found"])
        return keywords
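
A minimal usage sketch of the KeyBERT class above (not part of this commit; the example text and printed output are illustrative, and it assumes the package is installed and the sentence-transformers model weights can be downloaded):

from keybert.model import KeyBERT

doc = ("Supervised learning is the machine learning task of learning a function "
       "that maps an input to an output based on example input-output pairs.")

# Load the default sentence-transformers model and extract single-word keywords
kb = KeyBERT('distilbert-base-nli-mean-tokens')
keywords = kb.extract_keywords(doc, keyphrase_length=1, stop_words='english', top_n=5)
print(keywords)  # e.g. ['learning', 'supervised', ...] -- exact output depends on the model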

42
mkdocs.yml Normal file

@@ -0,0 +1,42 @@
site_name: KeyBERT
extra_css: [style.css]
repo_url: https://github.com/MaartenGr/keyBERT
site_url: https://maartengr.github.io/keyBERT/
site_description: Leveraging BERT to extract important keywords
site_author: Maarten P. Grootendorst
use_directory_urls: false
nav:
  - Home:
      - Index: index.md
  - API:
      - KeyBERT: api/keybert.md
plugins:
  - mkdocstrings:
      watch:
        - keybert
  - search
copyright: Copyright &copy; 2020 Maintained by <a href="https://github.com/MaartenGr">Maarten</a>.
theme:
  custom_dir: images/
  name: material
  icon:
    logo: material/library
  font:
    text: Ubuntu
    code: Ubuntu Mono
  favicon: icon.png
  logo: icon.png
  feature:
    tabs: true
  palette:
    primary: indigo
    accent: blue
markdown_extensions:
  - codehilite
  - pymdownx.inlinehilite
  - pymdownx.details
  - pymdownx.tabbed
  - pymdownx.highlight:
      use_pygments: true
  - toc:
      permalink: true

58
setup.py Normal file

@@ -0,0 +1,58 @@
import setuptools

test_packages = [
    "pytest>=5.4.3",
    "pytest-cov>=2.6.1"
]

base_packages = [
    "sentence-transformers>=0.3.8",
    "scikit-learn>=0.22.2",
    "numpy>=1.18.5",
]

docs_packages = [
    "mkdocs==1.1",
    "mkdocs-material==4.6.3",
    "mkdocstrings==0.8.0",
]

dev_packages = docs_packages + test_packages

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="keybert",
    packages=["keybert"],
    version="0.0.1",
    author="Maarten Grootendorst",
    author_email="maartengrootendorst@gmail.com",
    description="KeyBERT performs keyword extraction with state-of-the-art transformer models.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/MaartenGr/keyBERT",
    keywords="nlp bert keyword extraction embeddings",
    classifiers=[
        "Programming Language :: Python",
        "Intended Audience :: Science/Research",
        "Intended Audience :: Developers",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "License :: OSI Approved :: MIT License",
        "Topic :: Scientific/Engineering",
        "Operating System :: Microsoft :: Windows",
        "Operating System :: POSIX",
        "Operating System :: Unix",
        "Operating System :: MacOS",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
    install_requires=base_packages,
    extras_require={
        "test": test_packages,
        "docs": docs_packages,
        "dev": dev_packages,
    },
    python_requires='>=3.6',
)

0
tests/__init__.py Normal file

15
tests/conftest.py Normal file

@@ -0,0 +1,15 @@
# from bertopic import BERTopic
# import pytest
#
#
# @pytest.fixture(scope="module")
# def base_bertopic():
# model = BERTopic(bert_model='distilbert-base-nli-mean-tokens',
# top_n_words=20,
# nr_topics=None,
# n_gram_range=(1, 1),
# min_topic_size=30,
# n_neighbors=15,
# n_components=5,
# verbose=False)
# return model

20
tests/test_model.py Normal file

@@ -0,0 +1,20 @@
# import pytest
# import numpy as np
# import pandas as pd
# from unittest import mock
#
# from sklearn.datasets import fetch_20newsgroups, make_blobs
# from keybert import KeyBERT
#
# newsgroup_docs = fetch_20newsgroups(subset='all')['data'][:1000]
#
# @mock.patch("bertopic.model.BERTopic._extract_embeddings")
# def test_fit_transform(embeddings, base_bertopic):
# """ Test whether predictions are correctly made """
# blobs, _ = make_blobs(n_samples=len(newsgroup_docs), centers=5, n_features=768, random_state=42)
# embeddings.return_value = blobs
# predictions = base_bertopic.fit_transform(newsgroup_docs)
#
# assert isinstance(predictions, list)
# assert len(predictions) == len(newsgroup_docs)
# assert not set(predictions).difference(set(base_bertopic.get_topics().keys()))
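
A test against the extract_keywords API defined in keybert/model.py might look like the following sketch (hypothetical, not part of this commit; it downloads the sentence-transformers model at runtime, so it is slow):

import pytest
from keybert.model import KeyBERT

doc = ("Supervised learning is the machine learning task of learning a function "
       "that maps an input to an output based on example input-output pairs.")


@pytest.fixture(scope="module")
def base_keybert():
    return KeyBERT(model='distilbert-base-nli-mean-tokens')


@pytest.mark.parametrize("keyphrase_length", [1, 2])
def test_extract_keywords_single_doc(base_keybert, keyphrase_length):
    """ Check that single-document extraction returns top_n keyphrases of the requested length """
    keywords = base_keybert.extract_keywords(doc, keyphrase_length=keyphrase_length, top_n=5)
    assert isinstance(keywords, list)
    assert len(keywords) <= 5
    assert all(len(keyword.split()) == keyphrase_length for keyword in keywords)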

0
theme/style.css Normal file