{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## sklearn and TextAttack\n",
    "\n",
    "This following code trains two different text classification models using sklearn. Both use logistic regression models: the difference is in the features. \n",
    "\n",
    "We will load data using `nlp`, train the models, and subsequently attack them using TextAttack.\n",
    "\n",
    "### Training\n",
    "\n",
    "This code trains two models: one on bag-of-words statistics (`bow_unstemmed`) and one on tf–idf statistics (`tfidf_unstemmed`). The dataset is the IMDB movie review dataset.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "...successfully loaded training data\n",
      "Total length of training data:  25000\n",
      "...augmented data with len_tokens and average_words\n",
      "...successfully loaded testing data\n",
      "Total length of testing data:  25000\n",
      "...augmented data with len_tokens and average_words\n",
      "...successfully created the unstemmed BOW data\n",
      "...successfully created the unstemmed TFIDF data\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jxm/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training accuracy of BOW Unstemmed:  1.0\n",
      "Testing accuracy of BOW Unstemmed:  0.83864\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      0.85      0.84     12500\n",
      "           1       0.85      0.83      0.84     12500\n",
      "\n",
      "    accuracy                           0.84     25000\n",
      "   macro avg       0.84      0.84      0.84     25000\n",
      "weighted avg       0.84      0.84      0.84     25000\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/jxm/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training accuracy of TFIDF Unstemmed:  0.98864\n",
      "Testing accuracy of TFIDF Unstemmed:  0.85672\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.85      0.87      0.86     12500\n",
      "           1       0.86      0.85      0.86     12500\n",
      "\n",
      "    accuracy                           0.86     25000\n",
      "   macro avg       0.86      0.86      0.86     25000\n",
      "weighted avg       0.86      0.86      0.86     25000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import nlp\n",
    "import os\n",
    "import pandas as pd\n",
    "import re\n",
    "from nltk import word_tokenize\n",
    "from nltk.stem import PorterStemmer\n",
    "from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS\n",
    "from sklearn import preprocessing\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# Nice to see additional metrics\n",
    "from sklearn.metrics import classification_report\n",
    "\n",
    "def load_data(dataset_split='train'):\n",
    "    dataset = nlp.load_dataset('imdb')[dataset_split]\n",
    "    # Open and import positve data\n",
    "    df = pd.DataFrame()\n",
    "    df['Review'] = [review['text'] for review in dataset]\n",
    "    df['Sentiment'] = [review['label'] for review in dataset]\n",
    "    # Remove non-alphanumeric characters\n",
    "    df['Review'] = df['Review'].apply(lambda x: re.sub(\"[^a-zA-Z]\", ' ', str(x)))\n",
    "    # Tokenize the training and testing data\n",
    "    df_tokenized = tokenize_review(df)\n",
    "    return df_tokenized\n",
    "\n",
    "def tokenize_review(df):\n",
    "    # Tokenize Reviews in training\n",
    "    tokened_reviews = [word_tokenize(rev) for rev in df['Review']]\n",
    "    # Create word stems\n",
    "    stemmed_tokens = []\n",
    "    porter = PorterStemmer()\n",
    "    for i in range(len(tokened_reviews)):\n",
    "        stems = [porter.stem(token) for token in tokened_reviews[i]]\n",
    "        stems = ' '.join(stems)\n",
    "        stemmed_tokens.append(stems)\n",
    "    df.insert(1, column='Stemmed', value=stemmed_tokens)\n",
    "    return df\n",
    "\n",
    "def transform_BOW(training, testing, column_name):\n",
    "    vect = CountVectorizer(max_features=10000, ngram_range=(1,3), stop_words=ENGLISH_STOP_WORDS)\n",
    "    vectFit = vect.fit(training[column_name])\n",
    "    BOW_training = vectFit.transform(training[column_name])\n",
    "    BOW_training_df = pd.DataFrame(BOW_training.toarray(), columns=vect.get_feature_names())\n",
    "    BOW_testing = vectFit.transform(testing[column_name])\n",
    "    BOW_testing_Df = pd.DataFrame(BOW_testing.toarray(), columns=vect.get_feature_names())\n",
    "    return vectFit, BOW_training_df, BOW_testing_Df\n",
    "\n",
    "def transform_tfidf(training, testing, column_name):\n",
    "    Tfidf = TfidfVectorizer(ngram_range=(1,3), max_features=10000, stop_words=ENGLISH_STOP_WORDS)\n",
    "    Tfidf_fit = Tfidf.fit(training[column_name])\n",
    "    Tfidf_training = Tfidf_fit.transform(training[column_name])\n",
    "    Tfidf_training_df = pd.DataFrame(Tfidf_training.toarray(), columns=Tfidf.get_feature_names())\n",
    "    Tfidf_testing = Tfidf_fit.transform(testing[column_name])\n",
    "    Tfidf_testing_df = pd.DataFrame(Tfidf_testing.toarray(), columns=Tfidf.get_feature_names())\n",
    "    return Tfidf_fit, Tfidf_training_df, Tfidf_testing_df\n",
    "\n",
    "def add_augmenting_features(df):\n",
    "    tokened_reviews = [word_tokenize(rev) for rev in df['Review']]\n",
    "    # Create feature that measures length of reviews\n",
    "    len_tokens = []\n",
    "    for i in range(len(tokened_reviews)):\n",
    "        len_tokens.append(len(tokened_reviews[i]))\n",
    "    len_tokens = preprocessing.scale(len_tokens)\n",
    "    df.insert(0, column='Lengths', value=len_tokens)\n",
    "\n",
    "    # Create average word length (training)\n",
    "    Average_Words = [len(x)/(len(x.split())) for x in df['Review'].tolist()]\n",
    "    Average_Words = preprocessing.scale(Average_Words)\n",
    "    df['averageWords'] = Average_Words\n",
    "    return df\n",
    "\n",
    "def build_model(X_train, y_train, X_test, y_test, name_of_test):\n",
    "    log_reg = LogisticRegression(C=30, max_iter=200).fit(X_train, y_train)\n",
    "    y_pred = log_reg.predict(X_test)\n",
    "    print('Training accuracy of '+name_of_test+': ', log_reg.score(X_train, y_train))\n",
    "    print('Testing accuracy of '+name_of_test+': ', log_reg.score(X_test, y_test))\n",
    "    print(classification_report(y_test, y_pred))  # Evaluating prediction ability\n",
    "    return log_reg\n",
    "\n",
    "# Load training and test sets\n",
    "# Loading reviews into DF\n",
    "df_train = load_data('train')\n",
    "\n",
    "print('...successfully loaded training data')\n",
    "print('Total length of training data: ', len(df_train))\n",
    "# Add augmenting features\n",
    "df_train = add_augmenting_features(df_train)\n",
    "print('...augmented data with len_tokens and average_words')\n",
    "\n",
    "# Load test DF\n",
    "df_test = load_data('test')\n",
    "\n",
    "print('...successfully loaded testing data')\n",
    "print('Total length of testing data: ', len(df_test))\n",
    "df_test = add_augmenting_features(df_test)\n",
    "print('...augmented data with len_tokens and average_words')\n",
    "\n",
    "# Create unstemmed BOW features for training set\n",
    "unstemmed_BOW_vect_fit, df_train_bow_unstem, df_test_bow_unstem = transform_BOW(df_train, df_test, 'Review')\n",
    "print('...successfully created the unstemmed BOW data')\n",
    "\n",
    "# Create TfIdf features for training set\n",
    "unstemmed_tfidf_vect_fit, df_train_tfidf_unstem, df_test_tfidf_unstem = transform_tfidf(df_train, df_test, 'Review')\n",
    "print('...successfully created the unstemmed TFIDF data')\n",
    "\n",
    "# Running logistic regression on dataframes\n",
    "bow_unstemmed = build_model(df_train_bow_unstem, df_train['Sentiment'], df_test_bow_unstem, df_test['Sentiment'], 'BOW Unstemmed')\n",
    "\n",
    "tfidf_unstemmed = build_model(df_train_tfidf_unstem, df_train['Sentiment'], df_test_tfidf_unstem, df_test['Sentiment'], 'TFIDF Unstemmed')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Attacking\n",
    "\n",
    "TextAttack includes a build-in `SklearnModelWrapper` that can run attacks on most sklearn models. (If your tokenization strategy is different than above, you may need to subclass `SklearnModelWrapper` to make sure the model inputs & outputs come in the correct format.)\n",
    "\n",
    "Once we initializes the model wrapper, we load a few samples from the IMDB dataset and run the `TextFoolerJin2019` attack on our model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textattack.models.wrappers import SklearnModelWrapper\n",
    "\n",
    "model_wrapper = SklearnModelWrapper(bow_unstemmed, unstemmed_BOW_vect_fit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textattack.datasets import HuggingFaceNlpDataset\n",
    "from textattack.attack_recipes import TextFoolerJin2019\n",
    "\n",
    "dataset = HuggingFaceNlpDataset(\"imdb\", None, \"train\")\n",
    "attack = TextFoolerJin2019(model_wrapper)\n",
    "\n",
    "results = attack.attack_dataset(dataset, indices=range(20))\n",
    "for idx, result in enumerate(results):\n",
    "    print(f'Result {idx}:')\n",
    "    print(result.__str__(color_method='ansi'))\n",
    "    print('\\n' + ('*' * 40) + '\\n')\n",
    "print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Conclusion\n",
    "We were able to train a model on the IMDB dataset using `sklearn` and use it in TextAttack by initializing with the `SklearnModelWrapper`. It's that simple!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}