mirror of
https://github.com/gmihaila/ml_things.git
synced 2021-10-04 01:29:04 +03:00
94 lines
2.4 KiB
Plaintext
94 lines
2.4 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"name": "fuzzy_match_words_texts.ipynb",
|
|
"version": "0.3.2",
|
|
"provenance": [],
|
|
"collapsed_sections": [],
|
|
"include_colab_link": true
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "view-in-github",
|
|
"colab_type": "text"
|
|
},
|
|
"source": [
|
|
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/fuzzy_match_words_texts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
|
]
|
|
},
|
|
{
|
|
"metadata": {
|
|
"id": "aIQBDE6O72Ea",
|
|
"colab_type": "text"
|
|
},
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"## Fuzzy match texts\n",
|
|
"\n",
|
|
"Reference: [Fuzzy String Matching in Python](https://streamhacker.com/2011/10/31/fuzzy-string-matching-python/) (StreamHacker blog post)"
|
|
]
|
|
},
|
|
{
|
|
"metadata": {
|
|
"id": "K31wJ4cw7wjt",
|
|
"colab_type": "code",
|
|
"colab": {}
|
|
},
|
|
"cell_type": "code",
|
|
"source": [
|
|
"from nltk import metrics, stem, tokenize\n",
|
|
"from nltk.metrics import edit_distance\n",
|
|
" \n",
|
|
"stemmer = stem.PorterStemmer()\n",
|
|
" \n",
|
|
"def normalize(s):\n",
|
|
" words = tokenize.wordpunct_tokenize(s.lower().strip())\n",
|
|
" return ' '.join([stemmer.stem(w) for w in words])\n",
|
|
" \n",
|
|
"def fuzzy_match(s1, s2, max_dist=3):\n",
|
|
" return edit_distance(normalize(s1), normalize(s2)) <= max_dist\n"
|
|
],
|
|
"execution_count": 0,
|
|
"outputs": []
|
|
},
|
|
{
|
|
"metadata": {
|
|
"id": "cUCHBzvo77aA",
|
|
"colab_type": "code",
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 34
|
|
},
|
|
"outputId": "fabbe4ad-ecbe-4f0a-cb35-7150c934d6db"
|
|
},
|
|
"cell_type": "code",
|
|
"source": [
|
|
"fuzzy_match(\"security breach\", \"security breachess\")"
|
|
],
|
|
"execution_count": 2,
|
|
"outputs": [
|
|
{
|
|
"output_type": "execute_result",
|
|
"data": {
|
|
"text/plain": [
|
|
"True"
|
|
]
|
|
},
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"execution_count": 2
|
|
}
|
|
]
|
|
}
|
|
]
|
|
} |