Mirror of https://github.com/modAL-python/modAL.git (synced 2022-05-17 00:31:33 +03:00)
FIX #108 - no longer storing transformed training data for on_transformed strategies
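Background: with on_transformed=True, query strategies operate on the estimator's transformed view of the data. BaseLearner used to cache that view of the training set in self.Xt_training, but a learnable transformer (for example a CountVectorizer inside a pipeline) changes every time the model is retrained, so the cache goes stale and no longer matches freshly transformed pool data. This commit drops the cache and recomputes the transformed training data on demand. A minimal sketch of the scenario, mirroring the test added at the end of this diff (the data and estimator choice are illustrative only, not part of the commit):

import modAL.batch
import modAL.models
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

X_labeled, y_labeled = ['Dog', 'Cat', 'Tree'], [0, 1, 1]
X_pool, y_pool = ['Airplane', 'House'], [0, 1]

learner = modAL.models.ActiveLearner(
    estimator=make_pipeline(CountVectorizer(), RandomForestClassifier(n_estimators=10)),
    query_strategy=modAL.batch.uncertainty_batch_sampling,
    X_training=X_labeled, y_training=y_labeled,
    on_transformed=True,  # the query strategy works on the transformed representation
)

# Teaching refits the pipeline, so the CountVectorizer vocabulary changes.
# Before this fix, the cached Xt_training kept the old representation, which later
# queries could no longer combine with freshly transformed pool data; after it,
# the transformed training data is recomputed at query time instead.
for _ in range(len(X_pool)):
    query_idx, _ = learner.query(X_pool, n_instances=1)
    i = query_idx[0]
    learner.teach(X=[X_pool[i]], y=[y_pool[i]])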
@@ -8,7 +8,7 @@ import numpy as np
 import scipy.sparse as sp
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min

-from modAL.utils.data import data_vstack, modALinput
+from modAL.utils.data import data_vstack, modALinput, data_shape
 from modAL.models.base import BaseCommittee, BaseLearner
 from modAL.uncertainty import classifier_uncertainty

@@ -150,8 +150,10 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
     if classifier.X_training is None:
         best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
         instance_index_ranking = [best_coldstart_instance_index]
-    elif classifier.X_training.shape[0] > 0:
-        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
+    elif data_shape(classifier.X_training)[0] > 0:
+        labeled = classifier.transform_without_estimating(
+            classifier.X_training
+        ) if classifier.on_transformed else classifier.X_training[:]
         instance_index_ranking = []

     # The maximum number of records to sample.
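Two aspects of ranked_batch change here. First, the length check goes through data_shape (newly imported above) instead of X_training.shape[0], so training data kept as a plain Python list, as in the test added below, no longer trips over the missing .shape attribute. Second, the labeled representation is no longer read from the removed Xt_training cache but recomputed with the model's current transformation. For a single sklearn Pipeline, transform_without_estimating corresponds roughly to running every step except the final predictor; an illustrative sketch of that idea (not the modAL implementation):

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(CountVectorizer(), RandomForestClassifier(n_estimators=10))
pipe.fit(['Dog', 'Cat', 'Tree'], [0, 1, 1])

# pipe[:-1] is the fitted pipeline minus its final estimator, so this yields the
# feature matrix the classifier actually sees for new raw inputs:
Xt = pipe[:-1].transform(['Airplane', 'House'])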
@@ -66,11 +66,9 @@ class BaseLearner(ABC, BaseEstimator):
         self.on_transformed = on_transformed

         self.X_training = X_training
-        self.Xt_training = None
         self.y_training = y_training
         if X_training is not None:
             self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
-            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None

         assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
         self.force_all_finite = force_all_finite
@@ -92,15 +90,10 @@ class BaseLearner(ABC, BaseEstimator):

         if self.X_training is None:
             self.X_training = X
-            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
             self.y_training = y
         else:
             try:
                 self.X_training = data_vstack((self.X_training, X))
-                self.Xt_training = data_vstack((
-                    self.Xt_training,
-                    self.transform_without_estimating(X)
-                )) if self.on_transformed else None
                 self.y_training = data_vstack((self.y_training, y))
             except ValueError:
                 raise ValueError('the dimensions of the new training data and label must'
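The lines removed from _add_training_data above are where the stale cache actually caused trouble: teach() tried to data_vstack the old cached Xt_training with the new batch transformed by the freshly refitted pipeline, and with a learnable transformer those two matrices generally have different widths. A small self-contained illustration of the mismatch, using CountVectorizer as in the test added below:

from sklearn.feature_extraction.text import CountVectorizer

old_vec = CountVectorizer().fit(['Dog', 'Cat', 'Tree'])
new_vec = CountVectorizer().fit(['Dog', 'Cat', 'Tree', 'Airplane'])

# The two vocabularies produce matrices with different column counts, so stacking
# a cached old-vocabulary matrix with a freshly transformed one cannot work:
old_vec.transform(['Dog']).shape  # (1, 3)
new_vec.transform(['Dog']).shape  # (1, 4)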
@@ -213,7 +206,6 @@ class BaseLearner(ABC, BaseEstimator):
         check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
                   force_all_finite=self.force_all_finite)
         self.X_training, self.y_training = X, y
-        self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
         return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)

     def predict(self, X: modALinput, **predict_kwargs) -> Any:
@@ -29,6 +29,7 @@ from sklearn.svm import SVC
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import FunctionTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 from scipy.stats import entropy, norm
 from scipy.special import ndtr
 from scipy import sparse as sp
@@ -824,6 +825,45 @@ class TestActiveLearner(unittest.TestCase):
         query_idx, query_inst = learner.query(X_pool)
         learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])

+    def test_on_transformed_with_variable_transformation(self):
+        """
+        Learnable transformations naturally change after a model is retrained. Make sure this is handled
+        properly for on_transformed=True query strategies.
+        """
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies which work with instance representations
+            # no further ones as of 09.12.2020
+        ]
+
+        X_labeled = ['Dog', 'Cat', 'Tree']
+
+        # contains unseen in labeled words, training model on those
+        # will alter CountVectorizer transformations
+        X_pool = ['Airplane', 'House']
+
+        y = [0, 1, 1, 0, 1] # irrelevant for test
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.ActiveLearner(
+                estimator=make_pipeline(
+                    CountVectorizer(),
+                    RandomForestClassifier(n_estimators=10)
+                ),
+                query_strategy=query_strategy,
+                X_training=X_labeled, y_training=y[:len(X_labeled)],
+                on_transformed=True,
+            )
+
+            for _ in range(len(X_pool)):
+                query_idx, query_instance = learner.query(X_pool, n_instances=1)
+                i = query_idx[0]
+
+                learner.teach(
+                    X=[X_pool[i]],
+                    y=[y[i]]
+                )
+
     def test_old_query_strategy_interface(self):
         n_samples = 10
         n_features = 5