mirror of
https://github.com/gmihaila/ml_things.git
synced 2021-10-04 01:29:04 +03:00
Created using Colaboratory
This commit is contained in:
203
universal_fuzzy_word_matching.ipynb
Normal file
203
universal_fuzzy_word_matching.ipynb
Normal file
@@ -0,0 +1,203 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "universal_fuzzy_word_matching.ipynb",
|
||||
"version": "0.3.2",
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/universal_fuzzy_word_matching.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "cT8qtvSwkTjD",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Fuzzy Matching any language words\n",
|
||||
"\n",
|
||||
"[Fuzzy String Matching in Python](https://medium.com/@categitau/fuzzy-string-matching-in-python-68f240d910fe)\n",
|
||||
"\n",
|
||||
"[Fuzzy String Matching](https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python/)\n",
|
||||
"\n",
|
||||
"https://github.com/seatgeek/fuzzywuzzy "
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "wfkBq9KakNpv",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# used to clean console\n",
|
||||
"from IPython.display import clear_output\n",
|
||||
"\n",
|
||||
"!pip install fuzzywuzzy[speedup]\n",
|
||||
"\n",
|
||||
"clear_output()\n"
|
||||
],
|
||||
"execution_count": 0,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "SQii1QKTkwrA",
|
||||
"colab_type": "code",
|
||||
"colab": {}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from fuzzywuzzy import fuzz\n",
|
||||
"from fuzzywuzzy import process"
|
||||
],
|
||||
"execution_count": 0,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "3MUnfjerkmRl",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 34
|
||||
},
|
||||
"outputId": "589aafbe-10d7-4749-d3fa-67340a6e109d"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Levenshtein Distance to calculate the differences between sequences\n",
|
||||
"fuzz.ratio(\"Catherine M. Gitau\",\"Catherine Gitau\")"
|
||||
],
|
||||
"execution_count": 8,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"91"
|
||||
]
|
||||
},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"execution_count": 8
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "EmpeTI5HlZ4k",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 34
|
||||
},
|
||||
"outputId": "1bd930e1-b166-4e76-8cff-230f90c5778d"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# takes in the shortest string and matches it with all the sub-strings of length\n",
|
||||
"fuzz.partial_ratio(\"Catherine M. Gitau\",\"Catherine Gitau\")"
|
||||
],
|
||||
"execution_count": 9,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"80"
|
||||
]
|
||||
},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"execution_count": 9
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "FmsOJOAal6pD",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 34
|
||||
},
|
||||
"outputId": "5bf79fe8-9787-430a-d101-5b1ed4b49174"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# attempts to account for similar strings that are out of order\n",
|
||||
"fuzz.token_sort_ratio(\"Catherine Gitau M.\", \"Gitau Catherine\")"
|
||||
],
|
||||
"execution_count": 10,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "execute_result",
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"94"
|
||||
]
|
||||
},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"execution_count": 10
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "I9YY0qE_mxXe",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 68
|
||||
},
|
||||
"outputId": "5adb8746-8cf4-4416-d808-8b80c8426973"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# works on any language\n",
|
||||
"\n",
|
||||
"print(fuzz.token_sort_ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))\n",
|
||||
"\n",
|
||||
"print(fuzz.ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))\n",
|
||||
"\n",
|
||||
"print(fuzz.partial_ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))"
|
||||
],
|
||||
"execution_count": 15,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"86\n",
|
||||
"86\n",
|
||||
"82\n"
|
||||
],
|
||||
"name": "stdout"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user