Files
ml_things/fuzzy_match_words_texts.ipynb
2019-01-20 19:04:49 -06:00

94 lines
2.4 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "fuzzy_match_words_texts.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/fuzzy_match_words_texts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "aIQBDE6O72Ea",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Fuzzy match texts\n",
"\n",
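"Two texts are normalized (lowercased, tokenized, stemmed) and count as a match when the edit distance between the normalized strings is at most `max_dist`.\n",
"\n",
"Reference: [Fuzzy String Matching in Python (StreamHacker)](https://streamhacker.com/2011/10/31/fuzzy-string-matching-python/)"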
]
},
{
"metadata": {
"id": "K31wJ4cw7wjt",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from nltk import metrics, stem, tokenize\n",
"from nltk.metrics import edit_distance\n",
" \n",
"stemmer = stem.PorterStemmer()\n",
" \n",
"def normalize(s):\n",
"    \"\"\"Lowercase, strip, tokenize and stem `s`; return the stems joined by single spaces.\"\"\"\n",
"    cleaned = s.lower().strip()\n",
"    stems = []\n",
"    for token in tokenize.wordpunct_tokenize(cleaned):\n",
"        # Reduce each token to its stem using the module-level stemmer.\n",
"        stems.append(stemmer.stem(token))\n",
"    return ' '.join(stems)\n",
" \n",
"def fuzzy_match(s1, s2, max_dist=3):\n",
"    \"\"\"Return True when the edit distance between the normalized forms of `s1` and `s2` is at most `max_dist`.\"\"\"\n",
"    left = normalize(s1)\n",
"    right = normalize(s2)\n",
"    distance = edit_distance(left, right)\n",
"    return distance <= max_dist\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "cUCHBzvo77aA",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "fabbe4ad-ecbe-4f0a-cb35-7150c934d6db"
},
"cell_type": "code",
"source": [
"fuzzy_match(\"security breach\", \"security breachess\")"
],
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
}
]
}