resolves #20, #104 - added pandas support and option for transforming data in learner

2022-05-17 00:31:33 +03:00 · 2020-09-24 18:12:59 +02:00
parent ff7a52f4cf
commit 8e0cb25029
22 changed files with 217 additions and 160 deletions
--- a/README.md
+++ b/README.md
@@ -100,12 +100,11 @@ import numpy as np
 X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
 y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)
 ```
-For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```.
+For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query strategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```.
 ```python
 def GP_regression_std(regressor, X):
    _, std = regressor.predict(X, return_std=True)
-    query_idx = np.argmax(std)
-    return query_idx, X[query_idx]
+    return np.argmax(std)
 ```
 After setting up the query strategy and the data, the active learner can be initialized.
 ```python
--- a/docs/source/content/examples/active_regression.ipynb
+++ b/docs/source/content/examples/active_regression.ipynb
@@ -70,7 +70,7 @@
   "metadata": {},
   "source": [
    "## Uncertainty measure and query strategy for Gaussian processes\n",
-    "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```."
+    "For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query strategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```."
   ]
  },
  {
@@ -81,8 +81,7 @@
   "source": [
    "def GP_regression_std(regressor, X):\n",
    "    _, std = regressor.predict(X, return_std=True)\n",
-    "    query_idx = np.argmax(std)\n",
-    "    return query_idx, X[query_idx]"
+    "    return np.argmax(std)"
   ]
  },
  {
@@ -234,4 +233,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/docs/source/content/overview/Extending-modAL.ipynb
+++ b/docs/source/content/overview/Extending-modAL.ipynb
@@ -27,11 +27,8 @@
    "    # measure the utility of each instance in the pool\n",
    "    utility = utility_measure(classifier, X)\n",
    "\n",
-    "    # select the indices of the instances to be queried\n",
-    "    query_idx = select_instances(utility)\n",
-    "\n",
-    "    # return the indices and the instances\n",
-    "    return query_idx, X[query_idx]"
+    "    # select and return the indices of the instances to be queried\n",
+    "    return select_instances(utility)"
   ]
  },
  {
@@ -213,8 +210,7 @@
    "# classifier uncertainty and classifier margin\n",
    "def custom_query_strategy(classifier, X, n_instances=1):\n",
    "    utility = linear_combination(classifier, X)\n",
-    "    query_idx = multi_argmax(utility, n_instances=n_instances)\n",
-    "    return query_idx, X[query_idx]\n",
+    "    return multi_argmax(utility, n_instances=n_instances)\n",
    "\n",
    "custom_query_learner = ActiveLearner(\n",
    "    estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),\n",
@@ -299,4 +295,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/docs/source/content/overview/modAL-in-a-nutshell.rst
+++ b/docs/source/content/overview/modAL-in-a-nutshell.rst
@@ -118,15 +118,13 @@ the *noisy sine* function:
 For active learning, we shall define a custom query strategy tailored to
 Gaussian processes. In a nutshell, a *query stategy* in modAL is a
 function taking (at least) two arguments (an estimator object and a pool
-of examples), outputting the index of the queried instance and the
-instance itself. In our case, the arguments are ``regressor`` and ``X``.
+of examples), outputting the index of the queried instance. In our case, the arguments are ``regressor`` and ``X``.

 .. code:: python

    def GP_regression_std(regressor, X):
        _, std = regressor.predict(X, return_std=True)
-        query_idx = np.argmax(std)
-        return query_idx, X[query_idx]
+        return np.argmax(std)

 After setting up the query strategy and the data, the active learner can
 be initialized.
--- a/examples/active_regression.py
+++ b/examples/active_regression.py
@@ -12,8 +12,7 @@ from modAL.models import ActiveLearner
 # query strategy for regression
 def GP_regression_std(regressor, X):
    _, std = regressor.predict(X, return_std=True)
-    query_idx = np.argmax(std)
-    return query_idx, X[query_idx]
+    return np.argmax(std)


 # generating the data
--- a/examples/custom_query_strategies.py
+++ b/examples/custom_query_strategies.py
@@ -5,18 +5,16 @@ Template for query strategies

 The first two arguments of a query strategy function is always the estimator and the pool
 of instances to be queried from. Additional arguments are accepted as keyword arguments.
-A valid query strategy function always returns a tuple of the indices of the queried
-instances and the instances themselves.
+A valid query strategy function always returns the indices of the queried
+instances.

 def custom_query_strategy(classifier, X, a_keyword_argument=42):
    # measure the utility of each instance in the pool
    utility = utility_measure(classifier, X)

-    # select the indices of the instances to be queried
-    query_idx = select_instances(utility)
+    # select and return the indices of the instances to be queried
+    return select_instances(utility)

-    # return the indices and the instances
-    return query_idx, X[query_idx]

 This function can be used in the active learning workflow.

@@ -97,8 +95,7 @@ with plt.style.context('seaborn-white'):
 # classifier uncertainty and classifier margin
 def custom_query_strategy(classifier, X, n_instances=1):
    utility = linear_combination(classifier, X)
-    query_idx = multi_argmax(utility, n_instances=n_instances)
-    return query_idx, X[query_idx]
+    return multi_argmax(utility, n_instances=n_instances)

 custom_query_learner = ActiveLearner(
    estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),
--- a/examples/deep_bayesian_active_learning.py
+++ b/examples/deep_bayesian_active_learning.py
@@ -62,12 +62,10 @@ def max_entropy(learner, X, n_instances=1, T=100):
    expected_p = np.mean(MC_samples, axis=0)
    acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1)  # [batch size]
    idx = (-acquisition).argsort()[:n_instances]
-    query_idx = random_subset[idx]
-    return query_idx, X[query_idx]
+    return random_subset[idx]

 def uniform(learner, X, n_instances=1):
-    query_idx = np.random.choice(range(len(X)), size=n_instances, replace=False)
-    return query_idx, X[query_idx]
+    return np.random.choice(range(len(X)), size=n_instances, replace=False)

 """
 Training the ActiveLearner
--- a/examples/shape_learning.py
+++ b/examples/shape_learning.py
@@ -57,8 +57,7 @@ final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_wid


 def random_sampling(classsifier, X):
-    query_idx = np.random.randint(len(X))
-    return query_idx, X[query_idx]
+    return np.random.randint(len(X))


 X_pool = deepcopy(X_full)
--- a/modAL/acquisition.py
+++ b/modAL/acquisition.py
@@ -104,7 +104,7 @@ Query strategies using acquisition functions


 def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
-           n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+           n_instances: int = 1) -> np.ndarray:
    """
    Maximum PI query strategy. Selects the instance with highest probability of improvement.

@@ -118,13 +118,11 @@ def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    pi = optimizer_PI(optimizer, X, tradeoff=tradeoff)
-    query_idx = multi_argmax(pi, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(pi, n_instances=n_instances)


 def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
-           n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+           n_instances: int = 1) -> np.ndarray:
    """
    Maximum EI query strategy. Selects the instance with highest expected improvement.

@@ -138,13 +136,11 @@ def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    ei = optimizer_EI(optimizer, X, tradeoff=tradeoff)
-    query_idx = multi_argmax(ei, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(ei, n_instances=n_instances)


 def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
-            n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
+            n_instances: int = 1) -> np.ndarray:
    """
    Maximum UCB query strategy. Selects the instance with highest upper confidence bound.

@@ -158,6 +154,4 @@ def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
    """
    ucb = optimizer_UCB(optimizer, X, beta=beta)
-    query_idx = multi_argmax(ucb, n_instances=n_instances)
-
-    return query_idx, X[query_idx]
+    return multi_argmax(ucb, n_instances=n_instances)
--- a/modAL/batch.py
+++ b/modAL/batch.py
@@ -114,7 +114,7 @@ def select_instance(
    unlabeled_indices = [i for i in range(n_pool) if mask[i]]
    best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled]
    mask[best_instance_index] = 0
-    return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask
+    return best_instance_index, X_pool[[best_instance_index]], mask


 def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
@@ -142,11 +142,16 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
    """
    # Make a local copy of our classifier's training data.
    # Define our record container and record the best cold start instance in the case of cold start.
+
+    # transform unlabeled data if needed
+    if classifier.on_transformed:
+        unlabeled = classifier.transform_without_estimating(unlabeled)
+
    if classifier.X_training is None:
        best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
        instance_index_ranking = [best_coldstart_instance_index]
    elif classifier.X_training.shape[0] > 0:
-        labeled = classifier.X_training[:]
+        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
        instance_index_ranking = []
    
    # The maximum number of records to sample.
@@ -180,7 +185,7 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
                               metric: Union[str, Callable] = 'euclidean',
                               n_jobs: Optional[int] = None,
                               **uncertainty_measure_kwargs
-                               ) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
+                               ) -> np.ndarray:
    """
    Batch sampling query strategy. Selects the least sure instances for labelling.

@@ -206,6 +211,6 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
        Indices of the instances from `X` chosen to be labelled; records from `X` chosen to be labelled.
    """
    uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
-    query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
+    return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
                                 n_instances=n_instances, metric=metric, n_jobs=n_jobs)
-    return query_indices, X[query_indices]
+
--- a/modAL/disagreement.py
+++ b/modAL/disagreement.py
@@ -104,7 +104,7 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba

 def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
                          n_instances: int = 1, random_tie_break=False,
-                          **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                          **disagreement_measure_kwargs) -> np.ndarray:
    """
    Vote entropy sampling strategy.

@@ -124,16 +124,14 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
    disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)


 def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
                               n_instances: int = 1, random_tie_break=False,
-                               **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                               **disagreement_measure_kwargs) -> np.ndarray:
    """
    Consensus entropy sampling strategy.

@@ -153,16 +151,14 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
    disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)


 def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
                              n_instances: int = 1, random_tie_break=False,
-                              **disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                              **disagreement_measure_kwargs) -> np.ndarray:
    """
    Maximum disagreement sampling strategy.

@@ -182,16 +178,14 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
    disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(disagreement, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
+        return multi_argmax(disagreement, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(disagreement, n_instances=n_instances)


 def max_std_sampling(regressor: BaseEstimator, X: modALinput,
                     n_instances: int = 1,  random_tie_break=False,
-                     **predict_kwargs) -> Tuple[np.ndarray, modALinput]:
+                     **predict_kwargs) -> np.ndarray:
    """
    Regressor standard deviation sampling strategy.

@@ -211,8 +205,6 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput,
    std = std.reshape(X.shape[0], )

    if not random_tie_break:
-        query_idx = multi_argmax(std, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(std, n_instances=n_instances)
+        return multi_argmax(std, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(std, n_instances=n_instances)
--- a/modAL/expected_error.py
+++ b/modAL/expected_error.py
@@ -10,14 +10,14 @@ from sklearn.base import clone
 from sklearn.exceptions import NotFittedError

 from modAL.models import ActiveLearner
-from modAL.utils.data import modALinput, data_vstack
+from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows
 from modAL.utils.selection import multi_argmax, shuffled_argmax
 from modAL.uncertainty import _proba_uncertainty, _proba_entropy


 def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
                             p_subsample: np.float = 1.0, n_instances: int = 1,
-                             random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                             random_tie_break: bool = False) -> np.ndarray:
    """
    Expected error reduction query strategy.

@@ -52,17 +52,17 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
        X_proba = learner.predict_proba(X)
    except NotFittedError:
        # TODO: implement a proper cold-start
-        return 0, X[0]
+        return np.array([0])

    cloned_estimator = clone(learner.estimator)

-    for x_idx, x in enumerate(X):
+    for x_idx, x in enumerate_data(X):
        # subsample the data if needed
        if np.random.rand() <= p_subsample:
-            X_reduced = np.delete(X, x_idx, axis=0)
+            X_reduced = drop_rows(X, x_idx)
            # estimate the expected error
            for y_idx, y in enumerate(possible_labels):
-                X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0)))
+                X_new = data_vstack((learner.X_training, [x]))
                y_new = data_vstack((learner.y_training, np.array(y).reshape(1,)))

                cloned_estimator.fit(X_new, y_new)
@@ -78,8 +78,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
            expected_error[x_idx] = np.inf

    if not random_tie_break:
-        query_idx = multi_argmax(-expected_error, n_instances)
-    else:
-        query_idx = shuffled_argmax(-expected_error, n_instances)
+        return multi_argmax(-expected_error, n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(-expected_error, n_instances)
--- a/modAL/models/base.py
+++ b/modAL/models/base.py
@@ -5,14 +5,18 @@ Base classes for active learning algorithms

 import abc
 import sys
+import warnings
 from typing import Union, Callable, Optional, Tuple, List, Iterator, Any

 import numpy as np
 from sklearn.base import BaseEstimator
+from sklearn.ensemble._base import _BaseHeterogeneousEnsemble
+from sklearn.pipeline import Pipeline
 from sklearn.utils import check_X_y

-from modAL.utils.data import data_vstack, modALinput
+import scipy.sparse as sp

+from modAL.utils.data import data_vstack, modALinput, retrieve_rows

 if sys.version_info >= (3, 4):
    ABC = abc.ABC
@@ -34,6 +38,8 @@ class BaseLearner(ABC, BaseEstimator):
            When False, accepts np.nan and np.inf values.
        bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
            Useful when building Committee models with bagging.
+        on_transformed: Whether to transform samples with the pipeline defined by the estimator
+            when applying the query strategy.
        **fit_kwargs: keyword arguments.

    Attributes:
@@ -49,6 +55,7 @@ class BaseLearner(ABC, BaseEstimator):
                 X_training: Optional[modALinput] = None,
                 y_training: Optional[modALinput] = None,
                 bootstrap_init: bool = False,
+                 on_transformed: bool = False,
                 force_all_finite: bool = True,
                 **fit_kwargs
                 ) -> None:
@@ -56,11 +63,14 @@ class BaseLearner(ABC, BaseEstimator):

        self.estimator = estimator
        self.query_strategy = query_strategy
+        self.on_transformed = on_transformed

        self.X_training = X_training
+        self.Xt_training = None
        self.y_training = y_training
        if X_training is not None:
            self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
+            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None

        assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
        self.force_all_finite = force_all_finite
@@ -82,15 +92,65 @@ class BaseLearner(ABC, BaseEstimator):

        if self.X_training is None:
            self.X_training = X
+            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
            self.y_training = y
        else:
            try:
                self.X_training = data_vstack((self.X_training, X))
+                self.Xt_training = data_vstack((
+                    self.Xt_training,
+                    self.transform_without_estimating(X)
+                )) if self.on_transformed else None
                self.y_training = data_vstack((self.y_training, y))
            except ValueError:
                raise ValueError('the dimensions of the new training data and label must'
                                 'agree with the training data and labels provided so far')

+    def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]:
+        """
+        Transforms the data as supplied to the estimator.
+
+        * In case the estimator is a sklearn pipeline, it applies all pipeline components but the last one.
+        * In case the estimator is an ensemble, it concatenates the transformations for each classifier
+            (pipeline) in the ensemble.
+        * Otherwise returns the non-transformed dataset X
+        Args:
+            X: dataset to be transformed
+
+        Returns:
+            Transformed data set
+        """
+        Xt = []
+        pipes = [self.estimator]
+
+        if isinstance(self.estimator, _BaseHeterogeneousEnsemble):
+            pipes = self.estimator.estimators_
+
+        ################################
+        # transform data with pipelines used by estimator
+        for pipe in pipes:
+            if isinstance(pipe, Pipeline):
+                # NOTE: The used pipeline class might be an extension to sklearn's!
+                #       Create a new instance of the used pipeline class with all
+                #       components but the final estimator.
+                transformation_pipe = pipe.__class__(steps=pipe.steps[:-1])
+                Xt.append(transformation_pipe.transform(X))
+
+        # in case no transformation pipelines are used by the estimator,
+        # return the original, non-transformed data
+        if not Xt:
+            return X
+
+        ################################
+        # concatenate all transformations and return
+        # TODO: maybe use a newly implemented data_hstack() instead
+
+        # use sparse representation if any of the pipelines do
+        if any([isinstance(Xti, sp.csr_matrix) for Xti in Xt]):
+            return sp.hstack([sp.csc_matrix(Xti) for Xti in Xt])
+
+        return np.hstack(Xt)
+
    def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
        """
        Fits self.estimator to the training data and labels provided to it so far.
@@ -185,11 +245,12 @@ class BaseLearner(ABC, BaseEstimator):
        """
        return self.estimator.predict_proba(X, **predict_proba_kwargs)

-    def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
+    def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
        """
        Finds the n_instances most informative point in the data provided by calling the query_strategy function.

        Args:
+            X_pool: Pool of unlabeled instances to retrieve most informative instances from
            *query_args: The arguments for the query strategy. For instance, in the case of
                :func:`~modAL.uncertainty.uncertainty_sampling`, it is the pool of samples from which the query strategy
                should choose instances to request labels.
@@ -200,8 +261,15 @@ class BaseLearner(ABC, BaseEstimator):
            labelled and the instances themselves. Can be different in other cases, for instance only the instance to be
            labelled upon query synthesis.
        """
-        query_result = self.query_strategy(self, *query_args, **query_kwargs)
-        return query_result
+        query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs)
+
+        if isinstance(query_result, tuple):
+            warnings.warn("Query strategies should no longer return the selected instances, "
+                          "this is now handled by the query method. "
+                          "Please return only the indices of the selected instances.", DeprecationWarning)
+            return query_result
+
+        return query_result, retrieve_rows(X_pool, query_result)

    def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any:
        """
@@ -301,11 +369,12 @@ class BaseCommittee(ABC, BaseEstimator):

        return self

-    def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
+    def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
        """
        Finds the n_instances most informative point in the data provided by calling the query_strategy function.

        Args:
+            X_pool: Pool of unlabeled instances to retrieve most informative instances from
            *query_args: The arguments for the query strategy. For instance, in the case of
                :func:`~modAL.disagreement.max_disagreement_sampling`, it is the pool of samples from which the query.
                strategy should choose instances to request labels.
@@ -316,8 +385,15 @@ class BaseCommittee(ABC, BaseEstimator):
            be labelled and the instances themselves. Can be different in other cases, for instance only the instance to
            be labelled upon query synthesis.
        """
-        query_result = self.query_strategy(self, *query_args, **query_kwargs)
-        return query_result
+        query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs)
+
+        if isinstance(query_result, tuple):
+            warnings.warn("Query strategies should no longer return the selected instances, "
+                          "this is now handled by the query method. "
+                          "Please return only the indices of the selected instances", DeprecationWarning)
+            return query_result
+
+        return query_result, retrieve_rows(X_pool, query_result)

    def rebag(self, **fit_kwargs) -> None:
        """
--- a/modAL/models/learners.py
+++ b/modAL/models/learners.py
@@ -30,6 +30,8 @@ class ActiveLearner(BaseLearner):
        y_training: Initial training labels corresponding to initial training samples.
        bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
            Useful when building Committee models with bagging.
+        on_transformed: Whether to transform samples with the pipeline defined by the estimator
+            when applying the query strategy.
        **fit_kwargs: keyword arguments.

    Attributes:
@@ -73,10 +75,11 @@ class ActiveLearner(BaseLearner):
                 X_training: Optional[modALinput] = None,
                 y_training: Optional[modALinput] = None,
                 bootstrap_init: bool = False,
+                 on_transformed: bool = False,
                 **fit_kwargs
                 ) -> None:
        super().__init__(estimator, query_strategy,
-                         X_training, y_training, bootstrap_init, **fit_kwargs)
+                         X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs)

    def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None:
        """
@@ -177,9 +180,10 @@ class BayesianOptimizer(BaseLearner):
                 X_training: Optional[modALinput] = None,
                 y_training: Optional[modALinput] = None,
                 bootstrap_init: bool = False,
+                 on_transformed: bool = False,
                 **fit_kwargs) -> None:
        super(BayesianOptimizer, self).__init__(estimator, query_strategy,
-                                                X_training, y_training, bootstrap_init, **fit_kwargs)
+                                                X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs)
        # setting the maximum value
        if self.y_training is not None:
            max_idx = np.argmax(self.y_training)
@@ -481,8 +485,7 @@ class CommitteeRegressor(BaseCommittee):
        >>> # query strategy for regression
        >>> def ensemble_regression_std(regressor, X):
        ...     _, std = regressor.predict(X, return_std=True)
-        ...     query_idx = np.argmax(std)
-        ...     return query_idx, X[query_idx]
+        ...     return np.argmax(std)
        >>>
        >>> # initializing the CommitteeRegressor
        >>> committee = CommitteeRegressor(
--- a/modAL/multilabel.py
+++ b/modAL/multilabel.py
@@ -43,7 +43,7 @@ def _SVM_loss(multiclass_classifier: ActiveLearner,


 def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
-                       random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                       random_tie_break: bool = False) -> np.ndarray:
    """
    SVM binary minimum multilabel active learning strategy. For details see the paper
    Klaus Brinker, On Active Learning in Multi-label Classification
@@ -67,15 +67,13 @@ def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
    min_abs_dist = np.min(np.abs(decision_function), axis=1)

    if not random_tie_break:
-        query_idx = np.argmin(min_abs_dist)
-    else:
-        query_idx = shuffled_argmax(min_abs_dist)
+        return np.argmin(min_abs_dist)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(min_abs_dist)


 def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
-             n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+             n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:

    """
    Max Loss query strategy for SVM multilabel classification.
@@ -103,15 +101,13 @@ def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
    loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes)

    if not random_tie_break:
-        query_idx = multi_argmax(loss, n_instances)
-    else:
-        query_idx = shuffled_argmax(loss, n_instances)
+        return multi_argmax(loss, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(loss, n_instances)


 def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
-                  n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                  n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
    """
    Mean Max Loss query strategy for SVM multilabel classification.

@@ -136,15 +132,13 @@ def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
    loss = _SVM_loss(classifier, X_pool)

    if not random_tie_break:
-        query_idx = multi_argmax(loss, n_instances)
-    else:
-        query_idx = shuffled_argmax(loss, n_instances)
+        return multi_argmax(loss, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(loss, n_instances)


 def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
-                   n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                   n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
    """
    MinConfidence query strategy for multilabel classification.

@@ -167,15 +161,13 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
    classwise_min = np.min(classwise_confidence, axis=1)

    if not random_tie_break:
-        query_idx = multi_argmax(-classwise_min, n_instances)
-    else:
-        query_idx = shuffled_argmax(-classwise_min, n_instances)
+        return multi_argmax(-classwise_min, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(-classwise_min, n_instances)


 def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
-                   n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+                   n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
    """
    AvgConfidence query strategy for multilabel classification.

@@ -198,15 +190,13 @@ def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
    classwise_mean = np.mean(classwise_confidence, axis=1)

    if not random_tie_break:
-        query_idx = multi_argmax(classwise_mean, n_instances)
-    else:
-        query_idx = shuffled_argmax(classwise_mean, n_instances)
+        return multi_argmax(classwise_mean, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(classwise_mean, n_instances)


 def max_score(classifier: OneVsRestClassifier, X_pool: modALinput,
-              n_instances: int = 1, random_tie_break: bool = 1) -> Tuple[np.ndarray, modALinput]:
+              n_instances: int = 1, random_tie_break: bool = True) -> np.ndarray:
    """
    MaxScore query strategy for multilabel classification.

@@ -231,15 +221,13 @@ def max_score(classifier: OneVsRestClassifier, X_pool: modALinput,
    classwise_max = np.max(classwise_scores, axis=1)

    if not random_tie_break:
-        query_idx = multi_argmax(classwise_max, n_instances)
-    else:
-        query_idx = shuffled_argmax(classwise_max, n_instances)
+        return multi_argmax(classwise_max, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(classwise_max, n_instances)


 def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
-              n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
+              n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
    """
    AvgScore query strategy for multilabel classification.

@@ -264,8 +252,6 @@ def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
    classwise_mean = np.mean(classwise_scores, axis=1)

    if not random_tie_break:
-        query_idx = multi_argmax(classwise_mean, n_instances)
-    else:
-        query_idx = shuffled_argmax(classwise_mean, n_instances)
+        return multi_argmax(classwise_mean, n_instances)

-    return query_idx, X_pool[query_idx]
+    return shuffled_argmax(classwise_mean, n_instances)
--- a/modAL/uncertainty.py
+++ b/modAL/uncertainty.py
@@ -132,7 +132,7 @@ def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba

 def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
                         n_instances: int = 1, random_tie_break: bool = False,
-                         **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                         **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Uncertainty sampling query strategy. Selects the least sure instances for labelling.

@@ -152,16 +152,14 @@ def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
    uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(uncertainty, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)
+        return multi_argmax(uncertainty, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(uncertainty, n_instances=n_instances)


 def margin_sampling(classifier: BaseEstimator, X: modALinput,
                    n_instances: int = 1, random_tie_break: bool = False,
-                    **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                    **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Margin sampling query strategy. Selects the instances where the difference between
    the first most likely and second most likely classes are the smallest.
@@ -180,16 +178,14 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput,
    margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(-margin, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(-margin, n_instances=n_instances)
+        return multi_argmax(-margin, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(-margin, n_instances=n_instances)


 def entropy_sampling(classifier: BaseEstimator, X: modALinput,
                     n_instances: int = 1, random_tie_break: bool = False,
-                     **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
+                     **uncertainty_measure_kwargs) -> np.ndarray:
    """
    Entropy sampling query strategy. Selects the instances where the class probabilities
    have the largest entropy.
@@ -210,8 +206,6 @@ def entropy_sampling(classifier: BaseEstimator, X: modALinput,
    entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)

    if not random_tie_break:
-        query_idx = multi_argmax(entropy, n_instances=n_instances)
-    else:
-        query_idx = shuffled_argmax(entropy, n_instances=n_instances)
+        return multi_argmax(entropy, n_instances=n_instances)

-    return query_idx, X[query_idx]
+    return shuffled_argmax(entropy, n_instances=n_instances)
--- a/modAL/utils/combination.py
+++ b/modAL/utils/combination.py
@@ -78,7 +78,6 @@ def make_query_strategy(utility_measure: Callable, selector: Callable) -> Callab
    """
    def query_strategy(classifier: BaseEstimator, X: modALinput) -> Tuple:
        utility = utility_measure(classifier, X)
-        query_idx = selector(utility)
-        return query_idx, X[query_idx]
+        return selector(utility)

    return query_strategy
--- a/modAL/utils/data.py
+++ b/modAL/utils/data.py
@@ -1,11 +1,12 @@
-from typing import Union, Container
+from typing import Union, Container, List
 from itertools import chain

 import numpy as np
+import pandas as pd
 import scipy.sparse as sp


-modALinput = Union[list, np.ndarray, sp.csr_matrix]
+modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame]


 def data_vstack(blocks: Container) -> modALinput:
@@ -24,8 +25,71 @@ def data_vstack(blocks: Container) -> modALinput:
        return list(chain(blocks))
    elif sp.issparse(blocks[0]):
        return sp.vstack(blocks)
+    elif isinstance(blocks[0], pd.DataFrame):
+        # concat stacks all blocks (DataFrame.append took only blocks[1] and is removed in pandas 2.0)
+        return pd.concat(blocks)
    else:
        try:
            return np.concatenate(blocks)
        except:
            raise TypeError('%s datatype is not supported' % type(blocks[0]))
+
+
+def retrieve_rows(X: modALinput,
+                  I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
+    """
+    Returns the rows I from the data set X.
+
+    Args:
+        X: The data set to take the rows from.
+        I: Position(s) of the requested rows; for a DataFrame these are positions
+            (as in ``iloc``), not index labels.
+
+    Returns:
+        ``X.iloc[I]`` if X is a DataFrame, ``X[I]`` otherwise.
+    """
+    if isinstance(X, pd.DataFrame):
+        return X.iloc[I]
+
+    return X[I]
+
+
+def drop_rows(X: modALinput,
+              I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
+    """
+    Returns the data set X without the rows I.
+
+    Args:
+        X: The data set to remove the rows from.
+        I: Position(s) of the rows to drop, with the same meaning as in retrieve_rows:
+            positions, not DataFrame index labels.
+
+    Returns:
+        A new object holding the remaining rows; DataFrames and sparse matrices keep
+        their type, everything else goes through ``np.delete``.
+    """
+    if isinstance(X, pd.DataFrame) or sp.issparse(X):
+        # np.delete does not return these container types, so select the complement of I
+        # with a boolean mask instead.
+        keep = np.ones(X.shape[0], dtype=bool)
+        keep[I] = False
+        return X.iloc[keep] if isinstance(X, pd.DataFrame) else X[keep]
+
+    return np.delete(X, I, axis=0)
+
+
+def enumerate_data(X: modALinput):
+    """
+    Iterates over the rows of the data set X as (position, row) pairs.
+
+    Args:
+        X: The data set to iterate over.
+
+    Returns:
+        An iterator of (position, row) pairs; DataFrame rows are yielded as Series.
+    """
+    if isinstance(X, pd.DataFrame):
+        # iterrows yields index labels; renumber so that the first item is a position
+        return enumerate(row for _, row in X.iterrows())
+
+    return enumerate(X)
--- a/rtd_requirements.txt
+++ b/rtd_requirements.txt
@@ -3,3 +3,4 @@ scipy
 scikit-learn
 ipykernel
 nbsphinx
+pandas
--- a/setup.py
+++ b/setup.py
@@ -10,5 +10,5 @@ setup(
    url='https://modAL-python.github.io/',
    packages=['modAL', 'modAL.models', 'modAL.utils'],
    classifiers=['Development Status :: 4 - Beta'],
-    install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18'],
+    install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
 )
--- a/tests/core_tests.py
+++ b/tests/core_tests.py
@@ -140,8 +140,7 @@ class TestUtils(unittest.TestCase):
                query_1 = query_strategy(learner, X)
                query_2 = modAL.uncertainty.uncertainty_sampling(learner, X)

-                np.testing.assert_equal(query_1[0], query_2[0])
-                np.testing.assert_almost_equal(query_1[1], query_2[1])
+                np.testing.assert_equal(query_1, query_2)

    def test_data_vstack(self):
        for n_samples, n_features in product(range(1, 10), range(1, 10)):
@@ -560,10 +559,10 @@ class TestUncertainties(unittest.TestCase):
                    predict_proba = np.random.rand(n_samples, n_classes)
                    predict_proba[true_query_idx] = max_proba
                    classifier = mock.MockEstimator(predict_proba_return=predict_proba)
-                    query_idx, query_instance = modAL.uncertainty.uncertainty_sampling(
+                    query_idx = modAL.uncertainty.uncertainty_sampling(
                        classifier, np.random.rand(n_samples, n_classes)
                    )
-                    shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.uncertainty_sampling(
+                    shuffled_query_idx = modAL.uncertainty.uncertainty_sampling(
                        classifier, np.random.rand(n_samples, n_classes),
                        random_tie_break=True
                    )
@@ -577,10 +576,10 @@ class TestUncertainties(unittest.TestCase):
                    predict_proba[:, 0] = 1.0
                    predict_proba[true_query_idx, 0] = 0.0
                    classifier = mock.MockEstimator(predict_proba_return=predict_proba)
-                    query_idx, query_instance = modAL.uncertainty.margin_sampling(
+                    query_idx = modAL.uncertainty.margin_sampling(
                        classifier, np.random.rand(n_samples, n_classes)
                    )
-                    shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.margin_sampling(
+                    shuffled_query_idx = modAL.uncertainty.margin_sampling(
                        classifier, np.random.rand(n_samples, n_classes),
                        random_tie_break=True
                    )
@@ -595,10 +594,10 @@ class TestUncertainties(unittest.TestCase):
                    predict_proba[:, 0] = 1.0
                    predict_proba[true_query_idx] = max_proba
                    classifier = mock.MockEstimator(predict_proba_return=predict_proba)
-                    query_idx, query_instance = modAL.uncertainty.entropy_sampling(
+                    query_idx = modAL.uncertainty.entropy_sampling(
                        classifier, np.random.rand(n_samples, n_classes)
                    )
-                    shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.entropy_sampling(
+                    shuffled_query_idx = modAL.uncertainty.entropy_sampling(
                        classifier, np.random.rand(n_samples, n_classes),
                        random_tie_break=True
                    )
@@ -698,7 +697,7 @@ class TestActiveLearner(unittest.TestCase):
            for n_features in range(1, 10):
                X = np.random.rand(n_samples, n_features)
                query_idx = np.random.randint(0, n_samples)
-                mock_query = mock.MockFunction(return_val=(query_idx, X[query_idx]))
+                mock_query = mock.MockFunction(return_val=query_idx)
                learner = modAL.models.learners.ActiveLearner(
                    estimator=None,
                    query_strategy=mock_query
@@ -1107,4 +1106,3 @@ class TestExamples(unittest.TestCase):

 if __name__ == '__main__':
    unittest.main(verbosity=2)
-0
--- a/tests/example_tests/custom_query_strategies.py
+++ b/tests/example_tests/custom_query_strategies.py
@@ -42,8 +42,7 @@ product = make_product(
 # classifier uncertainty and classifier margin
 def custom_query_strategy(classifier, X, n_instances=1):
    utility = linear_combination(classifier, X)
-    query_idx = multi_argmax(utility, n_instances=n_instances)
-    return query_idx, X[query_idx]
+    return multi_argmax(utility, n_instances=n_instances)

 custom_query_learner = ActiveLearner(
    estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),