1
0
mirror of https://github.com/modAL-python/modAL.git synced 2022-05-17 00:31:33 +03:00

resolves #20, #104 - added pandas support and option for transforming data in learner

This commit is contained in:
Boyan Hristov
2020-09-24 18:12:59 +02:00
parent ff7a52f4cf
commit 8e0cb25029
22 changed files with 217 additions and 160 deletions

View File

@@ -100,12 +100,11 @@ import numpy as np
X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)
```
For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```.
For active learning, we shall define a custom query strategy tailored to Gaussian processes. In a nutshell, a *query strategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```.
```python
def GP_regression_std(regressor, X):
_, std = regressor.predict(X, return_std=True)
query_idx = np.argmax(std)
return query_idx, X[query_idx]
return np.argmax(std)
```
After setting up the query strategy and the data, the active learner can be initialized.
```python

View File

@@ -70,7 +70,7 @@
"metadata": {},
"source": [
"## Uncertainty measure and query strategy for Gaussian processes\n",
"For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance and the instance itself. In our case, the arguments are ```regressor``` and ```X```."
"For active learning, we shall define a custom query strategy tailored to Gaussian processes. More information on how to write your custom query strategies can be found at the page [Extending modAL](https://cosmic-cortex.github.io/modAL/Extending-modAL). In a nutshell, a *query stategy* in modAL is a function taking (at least) two arguments (an estimator object and a pool of examples), outputting the index of the queried instance. In our case, the arguments are ```regressor``` and ```X```."
]
},
{
@@ -81,8 +81,7 @@
"source": [
"def GP_regression_std(regressor, X):\n",
" _, std = regressor.predict(X, return_std=True)\n",
" query_idx = np.argmax(std)\n",
" return query_idx, X[query_idx]"
" return np.argmax(std)"
]
},
{
@@ -234,4 +233,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@@ -27,11 +27,8 @@
" # measure the utility of each instance in the pool\n",
" utility = utility_measure(classifier, X)\n",
"\n",
" # select the indices of the instances to be queried\n",
" query_idx = select_instances(utility)\n",
"\n",
" # return the indices and the instances\n",
" return query_idx, X[query_idx]"
" # select and return the indices of the instances to be queried\n",
" return select_instances(utility)"
]
},
{
@@ -213,8 +210,7 @@
"# classifier uncertainty and classifier margin\n",
"def custom_query_strategy(classifier, X, n_instances=1):\n",
" utility = linear_combination(classifier, X)\n",
" query_idx = multi_argmax(utility, n_instances=n_instances)\n",
" return query_idx, X[query_idx]\n",
" return multi_argmax(utility, n_instances=n_instances)\n",
"\n",
"custom_query_learner = ActiveLearner(\n",
" estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),\n",
@@ -299,4 +295,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

View File

@@ -118,15 +118,13 @@ the *noisy sine* function:
For active learning, we shall define a custom query strategy tailored to
Gaussian processes. In a nutshell, a *query strategy* in modAL is a
function taking (at least) two arguments (an estimator object and a pool
of examples), outputting the index of the queried instance and the
instance itself. In our case, the arguments are ``regressor`` and ``X``.
of examples), outputting the index of the queried instance. In our case, the arguments are ``regressor`` and ``X``.
.. code:: python
def GP_regression_std(regressor, X):
_, std = regressor.predict(X, return_std=True)
query_idx = np.argmax(std)
return query_idx, X[query_idx]
return np.argmax(std)
After setting up the query strategy and the data, the active learner can
be initialized.

View File

@@ -12,8 +12,7 @@ from modAL.models import ActiveLearner
# query strategy for regression
def GP_regression_std(regressor, X):
_, std = regressor.predict(X, return_std=True)
query_idx = np.argmax(std)
return query_idx, X[query_idx]
return np.argmax(std)
# generating the data

View File

@@ -5,18 +5,16 @@ Template for query strategies
The first two arguments of a query strategy function are always the estimator and the pool
of instances to be queried from. Additional arguments are accepted as keyword arguments.
A valid query strategy function always returns a tuple of the indices of the queried
instances and the instances themselves.
A valid query strategy function always returns indices of the queried
instances.
def custom_query_strategy(classifier, X, a_keyword_argument=42):
# measure the utility of each instance in the pool
utility = utility_measure(classifier, X)
# select the indices of the instances to be queried
query_idx = select_instances(utility)
# select and return the indices of the instances to be queried
return select_instances(utility)
# return the indices and the instances
return query_idx, X[query_idx]
This function can be used in the active learning workflow.
@@ -97,8 +95,7 @@ with plt.style.context('seaborn-white'):
# classifier uncertainty and classifier margin
def custom_query_strategy(classifier, X, n_instances=1):
utility = linear_combination(classifier, X)
query_idx = multi_argmax(utility, n_instances=n_instances)
return query_idx, X[query_idx]
return multi_argmax(utility, n_instances=n_instances)
custom_query_learner = ActiveLearner(
estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),

View File

@@ -62,12 +62,10 @@ def max_entropy(learner, X, n_instances=1, T=100):
expected_p = np.mean(MC_samples, axis=0)
acquisition = - np.sum(expected_p * np.log(expected_p + 1e-10), axis=-1) # [batch size]
idx = (-acquisition).argsort()[:n_instances]
query_idx = random_subset[idx]
return query_idx, X[query_idx]
return random_subset[idx]
def uniform(learner, X, n_instances=1):
query_idx = np.random.choice(range(len(X)), size=n_instances, replace=False)
return query_idx, X[query_idx]
return np.random.choice(range(len(X)), size=n_instances, replace=False)
"""
Training the ActiveLearner

View File

@@ -57,8 +57,7 @@ final_prediction = learner.predict_proba(X_full)[:, 1].reshape(im_height, im_wid
def random_sampling(classsifier, X):
query_idx = np.random.randint(len(X))
return query_idx, X[query_idx]
return np.random.randint(len(X))
X_pool = deepcopy(X_full)

View File

@@ -104,7 +104,7 @@ Query strategies using acquisition functions
def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1) -> np.ndarray:
"""
Maximum PI query strategy. Selects the instance with highest probability of improvement.
@@ -118,13 +118,11 @@ def max_PI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
The indices of the instances from X chosen to be labelled.
"""
pi = optimizer_PI(optimizer, X, tradeoff=tradeoff)
query_idx = multi_argmax(pi, n_instances=n_instances)
return query_idx, X[query_idx]
return multi_argmax(pi, n_instances=n_instances)
def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1) -> np.ndarray:
"""
Maximum EI query strategy. Selects the instance with highest expected improvement.
@@ -138,13 +136,11 @@ def max_EI(optimizer: BaseLearner, X: modALinput, tradeoff: float = 0,
The indices of the instances from X chosen to be labelled.
"""
ei = optimizer_EI(optimizer, X, tradeoff=tradeoff)
query_idx = multi_argmax(ei, n_instances=n_instances)
return query_idx, X[query_idx]
return multi_argmax(ei, n_instances=n_instances)
def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
n_instances: int = 1) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1) -> np.ndarray:
"""
Maximum UCB query strategy. Selects the instance with highest upper confidence bound.
@@ -158,6 +154,4 @@ def max_UCB(optimizer: BaseLearner, X: modALinput, beta: float = 1,
The indices of the instances from X chosen to be labelled.
"""
ucb = optimizer_UCB(optimizer, X, beta=beta)
query_idx = multi_argmax(ucb, n_instances=n_instances)
return query_idx, X[query_idx]
return multi_argmax(ucb, n_instances=n_instances)

View File

@@ -114,7 +114,7 @@ def select_instance(
unlabeled_indices = [i for i in range(n_pool) if mask[i]]
best_instance_index = unlabeled_indices[best_instance_index_in_unlabeled]
mask[best_instance_index] = 0
return best_instance_index, np.expand_dims(X_pool[best_instance_index], axis=0), mask
return best_instance_index, X_pool[[best_instance_index]], mask
def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
@@ -142,11 +142,16 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
"""
# Make a local copy of our classifier's training data.
# Define our record container and record the best cold start instance in the case of cold start.
# transform unlabeled data if needed
if classifier.on_transformed:
unlabeled = classifier.transform_without_estimating(unlabeled)
if classifier.X_training is None:
best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
instance_index_ranking = [best_coldstart_instance_index]
elif classifier.X_training.shape[0] > 0:
labeled = classifier.X_training[:]
labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
instance_index_ranking = []
# The maximum number of records to sample.
@@ -180,7 +185,7 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
metric: Union[str, Callable] = 'euclidean',
n_jobs: Optional[int] = None,
**uncertainty_measure_kwargs
) -> Tuple[np.ndarray, Union[np.ndarray, sp.csr_matrix]]:
) -> np.ndarray:
"""
Batch sampling query strategy. Selects the least sure instances for labelling.
@@ -206,6 +211,6 @@ def uncertainty_batch_sampling(classifier: Union[BaseLearner, BaseCommittee],
Indices of the instances from `X` chosen to be labelled.
"""
uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
query_indices = ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
return ranked_batch(classifier, unlabeled=X, uncertainty_scores=uncertainty,
n_instances=n_instances, metric=metric, n_jobs=n_jobs)
return query_indices, X[query_indices]

View File

@@ -104,7 +104,7 @@ def KL_max_disagreement(committee: BaseCommittee, X: modALinput, **predict_proba
def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
n_instances: int = 1, random_tie_break=False,
**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**disagreement_measure_kwargs) -> np.ndarray:
"""
Vote entropy sampling strategy.
@@ -124,16 +124,14 @@ def vote_entropy_sampling(committee: BaseCommittee, X: modALinput,
disagreement = vote_entropy(committee, X, **disagreement_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(disagreement, n_instances=n_instances)
else:
query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
return multi_argmax(disagreement, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(disagreement, n_instances=n_instances)
def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
n_instances: int = 1, random_tie_break=False,
**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**disagreement_measure_kwargs) -> np.ndarray:
"""
Consensus entropy sampling strategy.
@@ -153,16 +151,14 @@ def consensus_entropy_sampling(committee: BaseCommittee, X: modALinput,
disagreement = consensus_entropy(committee, X, **disagreement_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(disagreement, n_instances=n_instances)
else:
query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
return multi_argmax(disagreement, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(disagreement, n_instances=n_instances)
def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
n_instances: int = 1, random_tie_break=False,
**disagreement_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**disagreement_measure_kwargs) -> np.ndarray:
"""
Maximum disagreement sampling strategy.
@@ -182,16 +178,14 @@ def max_disagreement_sampling(committee: BaseCommittee, X: modALinput,
disagreement = KL_max_disagreement(committee, X, **disagreement_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(disagreement, n_instances=n_instances)
else:
query_idx = shuffled_argmax(disagreement, n_instances=n_instances)
return multi_argmax(disagreement, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(disagreement, n_instances=n_instances)
def max_std_sampling(regressor: BaseEstimator, X: modALinput,
n_instances: int = 1, random_tie_break=False,
**predict_kwargs) -> Tuple[np.ndarray, modALinput]:
**predict_kwargs) -> np.ndarray:
"""
Regressor standard deviation sampling strategy.
@@ -211,8 +205,6 @@ def max_std_sampling(regressor: BaseEstimator, X: modALinput,
std = std.reshape(X.shape[0], )
if not random_tie_break:
query_idx = multi_argmax(std, n_instances=n_instances)
else:
query_idx = shuffled_argmax(std, n_instances=n_instances)
return multi_argmax(std, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(std, n_instances=n_instances)

View File

@@ -10,14 +10,14 @@ from sklearn.base import clone
from sklearn.exceptions import NotFittedError
from modAL.models import ActiveLearner
from modAL.utils.data import modALinput, data_vstack
from modAL.utils.data import modALinput, data_vstack, enumerate_data, drop_rows
from modAL.utils.selection import multi_argmax, shuffled_argmax
from modAL.uncertainty import _proba_uncertainty, _proba_entropy
def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str = 'binary',
p_subsample: np.float = 1.0, n_instances: int = 1,
random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
random_tie_break: bool = False) -> np.ndarray:
"""
Expected error reduction query strategy.
@@ -52,17 +52,17 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
X_proba = learner.predict_proba(X)
except NotFittedError:
# TODO: implement a proper cold-start
return 0, X[0]
return np.array([0])
cloned_estimator = clone(learner.estimator)
for x_idx, x in enumerate(X):
for x_idx, x in enumerate_data(X):
# subsample the data if needed
if np.random.rand() <= p_subsample:
X_reduced = np.delete(X, x_idx, axis=0)
X_reduced = drop_rows(X, x_idx)
# estimate the expected error
for y_idx, y in enumerate(possible_labels):
X_new = data_vstack((learner.X_training, np.expand_dims(x, axis=0)))
X_new = data_vstack((learner.X_training, [x]))
y_new = data_vstack((learner.y_training, np.array(y).reshape(1,)))
cloned_estimator.fit(X_new, y_new)
@@ -78,8 +78,6 @@ def expected_error_reduction(learner: ActiveLearner, X: modALinput, loss: str =
expected_error[x_idx] = np.inf
if not random_tie_break:
query_idx = multi_argmax(-expected_error, n_instances)
else:
query_idx = shuffled_argmax(-expected_error, n_instances)
return multi_argmax(-expected_error, n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(-expected_error, n_instances)

View File

@@ -5,14 +5,18 @@ Base classes for active learning algorithms
import abc
import sys
import warnings
from typing import Union, Callable, Optional, Tuple, List, Iterator, Any
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble._base import _BaseHeterogeneousEnsemble
from sklearn.pipeline import Pipeline
from sklearn.utils import check_X_y
from modAL.utils.data import data_vstack, modALinput
import scipy.sparse as sp
from modAL.utils.data import data_vstack, modALinput, retrieve_rows
if sys.version_info >= (3, 4):
ABC = abc.ABC
@@ -34,6 +38,8 @@ class BaseLearner(ABC, BaseEstimator):
When False, accepts np.nan and np.inf values.
bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
Useful when building Committee models with bagging.
on_transformed: Whether to transform samples with the pipeline defined by the estimator
when applying the query strategy.
**fit_kwargs: keyword arguments.
Attributes:
@@ -49,6 +55,7 @@ class BaseLearner(ABC, BaseEstimator):
X_training: Optional[modALinput] = None,
y_training: Optional[modALinput] = None,
bootstrap_init: bool = False,
on_transformed: bool = False,
force_all_finite: bool = True,
**fit_kwargs
) -> None:
@@ -56,11 +63,14 @@ class BaseLearner(ABC, BaseEstimator):
self.estimator = estimator
self.query_strategy = query_strategy
self.on_transformed = on_transformed
self.X_training = X_training
self.Xt_training = None
self.y_training = y_training
if X_training is not None:
self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
self.force_all_finite = force_all_finite
@@ -82,15 +92,65 @@ class BaseLearner(ABC, BaseEstimator):
if self.X_training is None:
self.X_training = X
self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
self.y_training = y
else:
try:
self.X_training = data_vstack((self.X_training, X))
self.Xt_training = data_vstack((
self.Xt_training,
self.transform_without_estimating(X)
)) if self.on_transformed else None
self.y_training = data_vstack((self.y_training, y))
except ValueError:
raise ValueError('the dimensions of the new training data and label must'
'agree with the training data and labels provided so far')
def transform_without_estimating(self, X: modALinput) -> Union[np.ndarray, sp.csr_matrix]:
    """
    Transforms the data as supplied to the estimator.

    * In case the estimator is an sklearn pipeline, it applies all pipeline components but the last one.
    * In case the estimator is an ensemble, it concatenates the transformations for each classifier
      (pipeline) in the ensemble.
    * Otherwise returns the non-transformed dataset X

    Args:
        X: dataset to be transformed

    Returns:
        Transformed data set
    """
    pipes = [self.estimator]

    if isinstance(self.estimator, _BaseHeterogeneousEnsemble):
        pipes = self.estimator.estimators_

    ################################
    # transform data with pipelines used by estimator
    # NOTE: The used pipeline class might be an extension to sklearn's!
    #       Create a new instance of the used pipeline class with all
    #       components but the final estimator.
    Xt = [
        pipe.__class__(steps=pipe.steps[:-1]).transform(X)
        for pipe in pipes
        if isinstance(pipe, Pipeline)
    ]

    # in case no transformation pipelines are used by the estimator,
    # return the original, non-transformed data
    if not Xt:
        return X

    ################################
    # concatenate all transformations and return
    # TODO: maybe use a newly implemented data_hstack() instead

    # use sparse representation if any of the pipelines do; sp.issparse
    # recognises every scipy sparse format, not only CSR, so sparse blocks
    # are never handed to np.hstack (which cannot stack them)
    if any(sp.issparse(Xti) for Xti in Xt):
        return sp.hstack([sp.csc_matrix(Xti) for Xti in Xt])

    return np.hstack(Xt)
def _fit_to_known(self, bootstrap: bool = False, **fit_kwargs) -> 'BaseLearner':
"""
Fits self.estimator to the training data and labels provided to it so far.
@@ -185,11 +245,12 @@ class BaseLearner(ABC, BaseEstimator):
"""
return self.estimator.predict_proba(X, **predict_proba_kwargs)
def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
"""
Finds the n_instances most informative point in the data provided by calling the query_strategy function.
Args:
X_pool: Pool of unlabeled instances to retrieve most informative instances from
*query_args: The arguments for the query strategy. For instance, in the case of
:func:`~modAL.uncertainty.uncertainty_sampling`, it is the pool of samples from which the query strategy
should choose instances to request labels.
@@ -200,8 +261,15 @@ class BaseLearner(ABC, BaseEstimator):
labelled and the instances themselves. Can be different in other cases, for instance only the instance to be
labelled upon query synthesis.
"""
query_result = self.query_strategy(self, *query_args, **query_kwargs)
return query_result
query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs)
if isinstance(query_result, tuple):
warnings.warn("Query strategies should no longer return the selected instances, "
"this is now handled by the query method. "
"Please return only the indices of the selected instances.", DeprecationWarning)
return query_result
return query_result, retrieve_rows(X_pool, query_result)
def score(self, X: modALinput, y: modALinput, **score_kwargs) -> Any:
"""
@@ -301,11 +369,12 @@ class BaseCommittee(ABC, BaseEstimator):
return self
def query(self, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
def query(self, X_pool, *query_args, **query_kwargs) -> Union[Tuple, modALinput]:
"""
Finds the n_instances most informative point in the data provided by calling the query_strategy function.
Args:
X_pool: Pool of unlabeled instances to retrieve most informative instances from
*query_args: The arguments for the query strategy. For instance, in the case of
:func:`~modAL.disagreement.max_disagreement_sampling`, it is the pool of samples from which the query
strategy should choose instances to request labels.
@@ -316,8 +385,15 @@ class BaseCommittee(ABC, BaseEstimator):
be labelled and the instances themselves. Can be different in other cases, for instance only the instance to
be labelled upon query synthesis.
"""
query_result = self.query_strategy(self, *query_args, **query_kwargs)
return query_result
query_result = self.query_strategy(self, X_pool, *query_args, **query_kwargs)
if isinstance(query_result, tuple):
warnings.warn("Query strategies should no longer return the selected instances, "
"this is now handled by the query method. "
"Please return only the indices of the selected instances", DeprecationWarning)
return query_result
return query_result, retrieve_rows(X_pool, query_result)
def rebag(self, **fit_kwargs) -> None:
"""

View File

@@ -30,6 +30,8 @@ class ActiveLearner(BaseLearner):
y_training: Initial training labels corresponding to initial training samples.
bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
Useful when building Committee models with bagging.
on_transformed: Whether to transform samples with the pipeline defined by the estimator
when applying the query strategy.
**fit_kwargs: keyword arguments.
Attributes:
@@ -73,10 +75,11 @@ class ActiveLearner(BaseLearner):
X_training: Optional[modALinput] = None,
y_training: Optional[modALinput] = None,
bootstrap_init: bool = False,
on_transformed: bool = False,
**fit_kwargs
) -> None:
super().__init__(estimator, query_strategy,
X_training, y_training, bootstrap_init, **fit_kwargs)
X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs)
def teach(self, X: modALinput, y: modALinput, bootstrap: bool = False, only_new: bool = False, **fit_kwargs) -> None:
"""
@@ -177,9 +180,10 @@ class BayesianOptimizer(BaseLearner):
X_training: Optional[modALinput] = None,
y_training: Optional[modALinput] = None,
bootstrap_init: bool = False,
on_transformed: bool = False,
**fit_kwargs) -> None:
super(BayesianOptimizer, self).__init__(estimator, query_strategy,
X_training, y_training, bootstrap_init, **fit_kwargs)
X_training, y_training, bootstrap_init, on_transformed, **fit_kwargs)
# setting the maximum value
if self.y_training is not None:
max_idx = np.argmax(self.y_training)
@@ -481,8 +485,7 @@ class CommitteeRegressor(BaseCommittee):
>>> # query strategy for regression
>>> def ensemble_regression_std(regressor, X):
... _, std = regressor.predict(X, return_std=True)
... query_idx = np.argmax(std)
... return query_idx, X[query_idx]
... return np.argmax(std)
>>>
>>> # initializing the CommitteeRegressor
>>> committee = CommitteeRegressor(

View File

@@ -43,7 +43,7 @@ def _SVM_loss(multiclass_classifier: ActiveLearner,
def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
random_tie_break: bool = False) -> np.ndarray:
"""
SVM binary minimum multilabel active learning strategy. For details see the paper
Klaus Brinker, On Active Learning in Multi-label Classification
@@ -67,15 +67,13 @@ def SVM_binary_minimum(classifier: ActiveLearner, X_pool: modALinput,
min_abs_dist = np.min(np.abs(decision_function), axis=1)
if not random_tie_break:
query_idx = np.argmin(min_abs_dist)
else:
query_idx = shuffled_argmax(min_abs_dist)
return np.argmin(min_abs_dist)
return query_idx, X_pool[query_idx]
return shuffled_argmax(min_abs_dist)
def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
"""
Max Loss query strategy for SVM multilabel classification.
@@ -103,15 +101,13 @@ def max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
loss = _SVM_loss(classifier, X_pool, most_certain_classes=most_certain_classes)
if not random_tie_break:
query_idx = multi_argmax(loss, n_instances)
else:
query_idx = shuffled_argmax(loss, n_instances)
return multi_argmax(loss, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(loss, n_instances)
def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
"""
Mean Max Loss query strategy for SVM multilabel classification.
@@ -136,15 +132,13 @@ def mean_max_loss(classifier: OneVsRestClassifier, X_pool: modALinput,
loss = _SVM_loss(classifier, X_pool)
if not random_tie_break:
query_idx = multi_argmax(loss, n_instances)
else:
query_idx = shuffled_argmax(loss, n_instances)
return multi_argmax(loss, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(loss, n_instances)
def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
"""
MinConfidence query strategy for multilabel classification.
@@ -167,15 +161,13 @@ def min_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
classwise_min = np.min(classwise_confidence, axis=1)
if not random_tie_break:
query_idx = multi_argmax(-classwise_min, n_instances)
else:
query_idx = shuffled_argmax(-classwise_min, n_instances)
return multi_argmax(-classwise_min, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(-classwise_min, n_instances)
def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
"""
AvgConfidence query strategy for multilabel classification.
@@ -198,15 +190,13 @@ def avg_confidence(classifier: OneVsRestClassifier, X_pool: modALinput,
classwise_mean = np.mean(classwise_confidence, axis=1)
if not random_tie_break:
query_idx = multi_argmax(classwise_mean, n_instances)
else:
query_idx = shuffled_argmax(classwise_mean, n_instances)
return multi_argmax(classwise_mean, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(classwise_mean, n_instances)
def max_score(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = 1) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = 1) -> np.ndarray:
"""
MaxScore query strategy for multilabel classification.
@@ -231,15 +221,13 @@ def max_score(classifier: OneVsRestClassifier, X_pool: modALinput,
classwise_max = np.max(classwise_scores, axis=1)
if not random_tie_break:
query_idx = multi_argmax(classwise_max, n_instances)
else:
query_idx = shuffled_argmax(classwise_max, n_instances)
return multi_argmax(classwise_max, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(classwise_max, n_instances)
def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
n_instances: int = 1, random_tie_break: bool = False) -> Tuple[np.ndarray, modALinput]:
n_instances: int = 1, random_tie_break: bool = False) -> np.ndarray:
"""
AvgScore query strategy for multilabel classification.
@@ -264,8 +252,6 @@ def avg_score(classifier: OneVsRestClassifier, X_pool: modALinput,
classwise_mean = np.mean(classwise_scores, axis=1)
if not random_tie_break:
query_idx = multi_argmax(classwise_mean, n_instances)
else:
query_idx = shuffled_argmax(classwise_mean, n_instances)
return multi_argmax(classwise_mean, n_instances)
return query_idx, X_pool[query_idx]
return shuffled_argmax(classwise_mean, n_instances)

View File

@@ -132,7 +132,7 @@ def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba
def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
n_instances: int = 1, random_tie_break: bool = False,
**uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**uncertainty_measure_kwargs) -> np.ndarray:
"""
Uncertainty sampling query strategy. Selects the least sure instances for labelling.
@@ -152,16 +152,14 @@ def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(uncertainty, n_instances=n_instances)
else:
query_idx = shuffled_argmax(uncertainty, n_instances=n_instances)
return multi_argmax(uncertainty, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(uncertainty, n_instances=n_instances)
def margin_sampling(classifier: BaseEstimator, X: modALinput,
n_instances: int = 1, random_tie_break: bool = False,
**uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**uncertainty_measure_kwargs) -> np.ndarray:
"""
Margin sampling query strategy. Selects the instances where the difference between
the first most likely and second most likely classes are the smallest.
@@ -180,16 +178,14 @@ def margin_sampling(classifier: BaseEstimator, X: modALinput,
margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(-margin, n_instances=n_instances)
else:
query_idx = shuffled_argmax(-margin, n_instances=n_instances)
return multi_argmax(-margin, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(-margin, n_instances=n_instances)
def entropy_sampling(classifier: BaseEstimator, X: modALinput,
n_instances: int = 1, random_tie_break: bool = False,
**uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
**uncertainty_measure_kwargs) -> np.ndarray:
"""
Entropy sampling query strategy. Selects the instances where the class probabilities
have the largest entropy.
@@ -210,8 +206,6 @@ def entropy_sampling(classifier: BaseEstimator, X: modALinput,
entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)
if not random_tie_break:
query_idx = multi_argmax(entropy, n_instances=n_instances)
else:
query_idx = shuffled_argmax(entropy, n_instances=n_instances)
return multi_argmax(entropy, n_instances=n_instances)
return query_idx, X[query_idx]
return shuffled_argmax(entropy, n_instances=n_instances)

View File

@@ -78,7 +78,6 @@ def make_query_strategy(utility_measure: Callable, selector: Callable) -> Callab
"""
def query_strategy(classifier: BaseEstimator, X: modALinput) -> Tuple:
utility = utility_measure(classifier, X)
query_idx = selector(utility)
return query_idx, X[query_idx]
return selector(utility)
return query_strategy

View File

@@ -1,11 +1,12 @@
from typing import Union, Container
from typing import Union, Container, List
from itertools import chain
import numpy as np
import pandas as pd
import scipy.sparse as sp
modALinput = Union[list, np.ndarray, sp.csr_matrix]
modALinput = Union[list, np.ndarray, sp.csr_matrix, pd.DataFrame]
def data_vstack(blocks: Container) -> modALinput:
@@ -24,8 +25,34 @@ def data_vstack(blocks: Container) -> modALinput:
return list(chain(blocks))
elif sp.issparse(blocks[0]):
return sp.vstack(blocks)
elif isinstance(blocks[0], pd.DataFrame):
return blocks[0].append(blocks[1])
else:
try:
return np.concatenate(blocks)
except:
raise TypeError('%s datatype is not supported' % type(blocks[0]))
def retrieve_rows(X: modALinput,
                  I: Union[int, List[int], np.ndarray]) -> Union[list, sp.csc_matrix, np.ndarray, pd.DataFrame]:
    """
    Returns the rows I from the data set X.

    Rows are addressed by position for pandas data frames (through ``.iloc``)
    and for plain lists; every other container is subscripted directly with I.

    Args:
        X: The data set to select rows from.
        I: Position(s) of the rows to return. For a list X this may also be a
            sequence of positions or a boolean mask.

    Returns:
        The selected row(s) of X. A list X indexed by a sequence yields a new
        list holding the selected rows.
    """
    if isinstance(X, pd.DataFrame):
        # .iloc selects by position, independent of the frame's index labels.
        return X.iloc[I]

    if isinstance(X, list) and np.ndim(I) > 0:
        # Built-in lists only accept an integer or a slice as subscript, so a
        # sequence of indices has to be resolved element by element.
        positions = np.asarray(I)
        if positions.dtype == bool:
            # A boolean mask selects the positions flagged True.
            positions = np.flatnonzero(positions)
        return [X[i] for i in positions]

    return X[I]
def drop_rows(X: modALinput,
              I: Union[int, List[int], np.ndarray]) -> Union[sp.csc_matrix, np.ndarray, pd.DataFrame]:
    """
    Returns the data set X with the rows I removed. X itself is not modified.

    Args:
        X: The data set to remove rows from.
        I: The row(s) to remove.

    Returns:
        A new container holding the remaining rows. Sparse input keeps its
        storage format.
    """
    if isinstance(X, pd.DataFrame):
        # NOTE(review): DataFrame.drop removes rows by index *label*, while
        # retrieve_rows selects by position via .iloc. The two agree for a
        # default RangeIndex starting at 0 -- confirm callers never pass
        # frames with a custom index.
        return X.drop(I, axis=0)

    if sp.issparse(X):
        # np.delete cannot operate on scipy sparse containers, so keep the
        # complement of I through a boolean row mask instead. CSR is used for
        # the selection because not every sparse format supports indexing.
        keep = np.ones(X.shape[0], dtype=bool)
        keep[I] = False
        return X.tocsr()[keep].asformat(X.format)

    return np.delete(X, I, axis=0)
def enumerate_data(X: modALinput):
    """
    Returns an iterator of (index, row) pairs over the data set X.

    For a pandas data frame the pairs come from ``iterrows``, so the index is
    the row's label and the row is a Series. Every other container is passed
    to the built-in ``enumerate`` and is therefore numbered from zero.
    """
    return X.iterrows() if isinstance(X, pd.DataFrame) else enumerate(X)

View File

@@ -3,3 +3,4 @@ scipy
scikit-learn
ipykernel
nbsphinx
pandas

View File

@@ -10,5 +10,5 @@ setup(
url='https://modAL-python.github.io/',
packages=['modAL', 'modAL.models', 'modAL.utils'],
classifiers=['Development Status :: 4 - Beta'],
install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18'],
install_requires=['numpy>=1.13', 'scikit-learn>=0.18', 'scipy>=0.18', 'pandas>=1.1.0'],
)

View File

@@ -140,8 +140,7 @@ class TestUtils(unittest.TestCase):
query_1 = query_strategy(learner, X)
query_2 = modAL.uncertainty.uncertainty_sampling(learner, X)
np.testing.assert_equal(query_1[0], query_2[0])
np.testing.assert_almost_equal(query_1[1], query_2[1])
np.testing.assert_equal(query_1, query_2)
def test_data_vstack(self):
for n_samples, n_features in product(range(1, 10), range(1, 10)):
@@ -560,10 +559,10 @@ class TestUncertainties(unittest.TestCase):
predict_proba = np.random.rand(n_samples, n_classes)
predict_proba[true_query_idx] = max_proba
classifier = mock.MockEstimator(predict_proba_return=predict_proba)
query_idx, query_instance = modAL.uncertainty.uncertainty_sampling(
query_idx = modAL.uncertainty.uncertainty_sampling(
classifier, np.random.rand(n_samples, n_classes)
)
shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.uncertainty_sampling(
shuffled_query_idx = modAL.uncertainty.uncertainty_sampling(
classifier, np.random.rand(n_samples, n_classes),
random_tie_break=True
)
@@ -577,10 +576,10 @@ class TestUncertainties(unittest.TestCase):
predict_proba[:, 0] = 1.0
predict_proba[true_query_idx, 0] = 0.0
classifier = mock.MockEstimator(predict_proba_return=predict_proba)
query_idx, query_instance = modAL.uncertainty.margin_sampling(
query_idx = modAL.uncertainty.margin_sampling(
classifier, np.random.rand(n_samples, n_classes)
)
shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.margin_sampling(
shuffled_query_idx = modAL.uncertainty.margin_sampling(
classifier, np.random.rand(n_samples, n_classes),
random_tie_break=True
)
@@ -595,10 +594,10 @@ class TestUncertainties(unittest.TestCase):
predict_proba[:, 0] = 1.0
predict_proba[true_query_idx] = max_proba
classifier = mock.MockEstimator(predict_proba_return=predict_proba)
query_idx, query_instance = modAL.uncertainty.entropy_sampling(
query_idx = modAL.uncertainty.entropy_sampling(
classifier, np.random.rand(n_samples, n_classes)
)
shuffled_query_idx, shuffled_query_instance = modAL.uncertainty.entropy_sampling(
shuffled_query_idx = modAL.uncertainty.entropy_sampling(
classifier, np.random.rand(n_samples, n_classes),
random_tie_break=True
)
@@ -698,7 +697,7 @@ class TestActiveLearner(unittest.TestCase):
for n_features in range(1, 10):
X = np.random.rand(n_samples, n_features)
query_idx = np.random.randint(0, n_samples)
mock_query = mock.MockFunction(return_val=(query_idx, X[query_idx]))
mock_query = mock.MockFunction(return_val=query_idx)
learner = modAL.models.learners.ActiveLearner(
estimator=None,
query_strategy=mock_query
@@ -1107,4 +1106,3 @@ class TestExamples(unittest.TestCase):
if __name__ == '__main__':
unittest.main(verbosity=2)
0

View File

@@ -42,8 +42,7 @@ product = make_product(
# classifier uncertainty and classifier margin
def custom_query_strategy(classifier, X, n_instances=1):
utility = linear_combination(classifier, X)
query_idx = multi_argmax(utility, n_instances=n_instances)
return query_idx, X[query_idx]
return multi_argmax(utility, n_instances=n_instances)
custom_query_learner = ActiveLearner(
estimator=GaussianProcessClassifier(1.0 * RBF(1.0)),