1
0
mirror of https://github.com/modAL-python/modAL.git synced 2022-05-17 00:31:33 +03:00

FIX #108 - no longer storing transformed training data for on_transformed strategies

This commit is contained in:
Boyan Hristov
2020-12-09 16:45:06 +01:00
parent 4254df9918
commit 47bc726ffd
3 changed files with 45 additions and 11 deletions

View File

@@ -8,7 +8,7 @@ import numpy as np
import scipy.sparse as sp
from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min
from modAL.utils.data import data_vstack, modALinput
from modAL.utils.data import data_vstack, modALinput, data_shape
from modAL.models.base import BaseCommittee, BaseLearner
from modAL.uncertainty import classifier_uncertainty
@@ -150,8 +150,10 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
if classifier.X_training is None:
best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
instance_index_ranking = [best_coldstart_instance_index]
elif classifier.X_training.shape[0] > 0:
labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
elif data_shape(classifier.X_training)[0] > 0:
labeled = classifier.transform_without_estimating(
classifier.X_training
) if classifier.on_transformed else classifier.X_training[:]
instance_index_ranking = []
# The maximum number of records to sample.

View File

@@ -66,11 +66,9 @@ class BaseLearner(ABC, BaseEstimator):
self.on_transformed = on_transformed
self.X_training = X_training
self.Xt_training = None
self.y_training = y_training
if X_training is not None:
self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
self.force_all_finite = force_all_finite
@@ -92,15 +90,10 @@ class BaseLearner(ABC, BaseEstimator):
if self.X_training is None:
self.X_training = X
self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
self.y_training = y
else:
try:
self.X_training = data_vstack((self.X_training, X))
self.Xt_training = data_vstack((
self.Xt_training,
self.transform_without_estimating(X)
)) if self.on_transformed else None
self.y_training = data_vstack((self.y_training, y))
except ValueError:
raise ValueError('the dimensions of the new training data and label must'
@@ -213,7 +206,6 @@ class BaseLearner(ABC, BaseEstimator):
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
force_all_finite=self.force_all_finite)
self.X_training, self.y_training = X, y
self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
def predict(self, X: modALinput, **predict_kwargs) -> Any:

View File

@@ -29,6 +29,7 @@ from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import entropy, norm
from scipy.special import ndtr
from scipy import sparse as sp
@@ -824,6 +825,45 @@ class TestActiveLearner(unittest.TestCase):
query_idx, query_inst = learner.query(X_pool)
learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])
def test_on_transformed_with_variable_transformation(self):
    """
    A learnable transformer (here CountVectorizer) produces a different feature
    space every time the model is refit. Verify that on_transformed=True query
    strategies keep working across refits, i.e. they re-transform the training
    data rather than relying on a stale cached representation.
    """
    # Strategies that operate on transformed instance representations.
    # As of 09.12.2020 uncertainty_batch_sampling is the only applicable one;
    # extend this list when further such strategies are added.
    strategies_under_test = [
        modAL.batch.uncertainty_batch_sampling
    ]

    X_labeled = ['Dog', 'Cat', 'Tree']
    # The pool holds words absent from the labeled set, so teaching on them
    # changes the CountVectorizer vocabulary (and thus the transformation).
    X_pool = ['Airplane', 'House']
    y = [0, 1, 1, 0, 1]  # label values are irrelevant for this test

    for strategy in strategies_under_test:
        learner = modAL.models.learners.ActiveLearner(
            estimator=make_pipeline(
                CountVectorizer(),
                RandomForestClassifier(n_estimators=10)
            ),
            query_strategy=strategy,
            X_training=X_labeled, y_training=y[:len(X_labeled)],
            on_transformed=True,
        )
        # Drain the pool one instance at a time; each teach() refits the
        # pipeline and therefore alters the transformation.
        for _ in X_pool:
            query_idx, _query_instance = learner.query(X_pool, n_instances=1)
            chosen = query_idx[0]
            learner.teach(
                X=[X_pool[chosen]],
                y=[y[chosen]]
            )
def test_old_query_strategy_interface(self):
n_samples = 10
n_features = 5