ml_things/fuzzy_match_words_texts.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "fuzzy_match_words_texts.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/fuzzy_match_words_texts.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "metadata": {
        "id": "aIQBDE6O72Ea",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Fuzzy match texts\n",
        "\n",
        "Reference: [StreamHacker blog post on fuzzy string matching in Python](https://streamhacker.com/2011/10/31/fuzzy-string-matching-python/)"
      ]
    },
    {
      "metadata": {
        "id": "K31wJ4cw7wjt",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "from nltk import metrics, stem, tokenize\n",
        "from nltk.metrics import edit_distance\n",
        "  \n",
        "stemmer = stem.PorterStemmer()\n",
        " \n",
        "def normalize(s):\n",
        "  \"\"\"Return `s` lowercased, stripped, tokenized and stemmed, with the stems joined by single spaces.\"\"\"\n",
        "  cleaned = s.lower().strip()\n",
        "  tokens = tokenize.wordpunct_tokenize(cleaned)\n",
        "  # Stem every token with the shared module-level PorterStemmer.\n",
        "  return ' '.join(map(stemmer.stem, tokens))\n",
        " \n",
        "def fuzzy_match(s1, s2, max_dist=3):\n",
        "  \"\"\"Return True if the normalized forms of `s1` and `s2` are at most `max_dist` edits apart.\"\"\"\n",
        "  left, right = normalize(s1), normalize(s2)\n",
        "  distance = edit_distance(left, right)\n",
        "  return distance <= max_dist\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "cUCHBzvo77aA",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "fabbe4ad-ecbe-4f0a-cb35-7150c934d6db"
      },
      "cell_type": "code",
      "source": [
        "fuzzy_match(\"security breach\", \"security breachess\")"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 2
        }
      ]
    }
  ]
}