1
0
mirror of https://github.com/QData/TextAttack.git synced 2021-10-13 00:05:06 +03:00
Files
textattack-nlp-transformer/textattack/datasets/dataset.py
2020-05-02 20:57:42 -04:00

69 lines
2.2 KiB
Python

from textattack.shared import utils
import pickle
class TextAttackDataset:
    """
    A dataset for text attacks.

    Any iterable of (label, text_input) pairs qualifies as
    a TextAttackDataset.

    Subclasses are expected to populate ``self.examples`` (a sequence) and
    ``self.i`` (the index of the next example to yield), typically by
    calling one of the ``_load_*`` helpers from their ``__init__``.
    """

    def __init__(self):
        """ Loads a full dataset from disk. """
        raise NotImplementedError()

    def __iter__(self):
        """ Returns the dataset itself; it acts as its own iterator. """
        return self

    def _process_example(self, raw_line):
        """ Processes each example read from a file. Implemented on a dataset-
            by-dataset basis.

            Args:
                raw_line (str): Line of the example to process.

            Returns:
                A tuple of text objects

            Raises:
                NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError()

    def _process_example_from_file(self, raw_line):
        """ Hook invoked by ``_load_classification_text_file`` for each line.

            Delegates to ``_process_example`` so that a subclass may override
            either name; previously this name was called but never defined
            on the base class.

            Args:
                raw_line (str): Line of the example to process.

            Returns:
                Whatever ``_process_example`` returns for ``raw_line``.
        """
        return self._process_example(raw_line)

    def __next__(self):
        """ Returns the example at position ``self.i`` and advances the cursor.

            Raises:
                StopIteration: once ``self.i`` has reached ``len(self.examples)``.
        """
        if self.i >= len(self.examples):
            raise StopIteration
        example = self.examples[self.i]
        self.i += 1
        return example

    def _load_pickle_file(self, file_name, offset=0):
        """ Loads ``self.examples`` from a pickle file and resets the cursor.

            Args:
                file_name (str): name passed to ``utils.download_if_needed``
                    to obtain the path of the pickle file.
                offset (int): number of leading examples to skip.
        """
        self.i = 0
        file_path = utils.download_if_needed(file_name)
        # NOTE(security): pickle.load can execute arbitrary code while
        # unpickling; only use this with files from a trusted source.
        # The context manager closes the handle even if unpickling fails.
        with open(file_path, "rb") as pickle_file:
            self.examples = pickle.load(pickle_file)
        self.examples = self.examples[offset:]

    def _load_classification_text_file(self, text_file_name, offset=0):
        """ Loads tuples from lines of a classification text file.

            Format must look like:

                1 this is a great little ...
                0 "i love hot n juicy .  ...
                0 "\""this world needs a ...

            Arguments:
                text_file_name (str): name passed to
                    ``utils.download_if_needed`` to obtain the file path.
                offset (int): line to start reading from
        """
        text_file_path = utils.download_if_needed(text_file_name)
        # The context manager closes the file even if a processing hook
        # raises part-way through.
        with open(text_file_path, 'r') as text_file:
            raw_lines = text_file.readlines()[offset:]
            # Clean every line before processing any of them, keeping the
            # original two-pass order.
            raw_lines = [self._clean_example(ex) for ex in raw_lines]
            self.examples = [
                self._process_example_from_file(ex) for ex in raw_lines
            ]
            self.i = 0

    def _clean_example(self, ex):
        """ Optionally pre-processes an input string before some tokenization.

            Only necessary for some datasets. The base implementation
            returns ``ex`` unchanged.
        """
        return ex