Created using Colaboratory

This commit is contained in:
George Mihaila
2019-01-25 23:27:33 -06:00
parent 758b72c586
commit 82f944cc92

View File

@@ -0,0 +1,203 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "universal_fuzzy_word_matching.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/universal_fuzzy_word_matching.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "cT8qtvSwkTjD",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"### Fuzzy Matching any language words\n",
"\n",
"[Fuzzy String Matching in Python](https://medium.com/@categitau/fuzzy-string-matching-in-python-68f240d910fe)\n",
"\n",
"[Fuzzy String Matching](https://marcobonzanini.com/2015/02/25/fuzzy-string-matching-in-python/)\n",
"\n",
"https://github.com/seatgeek/fuzzywuzzy "
]
},
{
"metadata": {
"id": "wfkBq9KakNpv",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# used to clean console\n",
"from IPython.display import clear_output\n",
"\n",
"!pip install fuzzywuzzy[speedup]\n",
"\n",
"clear_output()\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "SQii1QKTkwrA",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from fuzzywuzzy import fuzz\n",
"from fuzzywuzzy import process"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "3MUnfjerkmRl",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "589aafbe-10d7-4749-d3fa-67340a6e109d"
},
"cell_type": "code",
"source": [
"# Levenshtein Distance to calculate the differences between sequences\n",
"fuzz.ratio(\"Catherine M. Gitau\",\"Catherine Gitau\")"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"91"
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"metadata": {
"id": "EmpeTI5HlZ4k",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "1bd930e1-b166-4e76-8cff-230f90c5778d"
},
"cell_type": "code",
"source": [
"# takes in the shortest string and matches it with all the sub-strings of length\n",
"fuzz.partial_ratio(\"Catherine M. Gitau\",\"Catherine Gitau\")"
],
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"80"
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"metadata": {
"id": "FmsOJOAal6pD",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "5bf79fe8-9787-430a-d101-5b1ed4b49174"
},
"cell_type": "code",
"source": [
"# attempts to account for similar strings that are out of order\n",
"fuzz.token_sort_ratio(\"Catherine Gitau M.\", \"Gitau Catherine\")"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"94"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"metadata": {
"id": "I9YY0qE_mxXe",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 68
},
"outputId": "5adb8746-8cf4-4416-d808-8b80c8426973"
},
"cell_type": "code",
"source": [
"# works on any language\n",
"\n",
"print(fuzz.token_sort_ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))\n",
"\n",
"print(fuzz.ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))\n",
"\n",
"print(fuzz.partial_ratio(\"Dieser Apfel ist einen Telefon\", \"Diese Apfel sind ein Telefon\"))"
],
"execution_count": 15,
"outputs": [
{
"output_type": "stream",
"text": [
"86\n",
"86\n",
"82\n"
],
"name": "stdout"
}
]
}
]
}