from typing import Generator, List, Optional, Union

import numpy as np
from sklearn.model_selection._split import _BaseKFold
from sklearn.utils import check_random_state
from sklearn.utils.validation import indexable, _num_samples


class MonteCarloCV(_BaseKFold):
    """Monte Carlo cross-validation splitter for ordered data.

    Each split is a holdout around a randomly drawn "origin" (the index where
    the test window starts): a training window that ends just before the
    origin and a test window that starts at it.
    """

    def __init__(self,
                 n_splits: int,
                 train_size: float,
                 test_size: float,
                 gap: int = 0,
                 random_state: Optional[Union[int, np.random.RandomState]] = None):
        """
        Monte Carlo Cross-Validation

        Holdout applied in multiple testing periods.
        Testing origin (time-step where testing begins) is randomly chosen
        according to a Monte Carlo simulation.

        :param n_splits: (int) Number of Monte Carlo repetitions in the procedure
        :param train_size: (float) Train size, in terms of ratio of the total length of the series
        :param test_size: (float) Test size, in terms of ratio of the total length of the series
        :param gap: (int) Number of samples to exclude from the end of each train set before the test set.
            Must be non-negative (checked in ``split``).
        :param random_state: (int, RandomState or None) Seed or generator used to draw the
            testing origins. ``None`` (default) draws from NumPy's global random state.
        """
        self.n_splits = n_splits
        self.n_samples = -1  # unknown until split() sees the data
        self.gap = gap
        self.train_size = train_size
        self.test_size = test_size
        self.random_state = random_state
        self.train_n_samples = 0
        self.test_n_samples = 0
        self.mc_origins = []

    def split(self, X, y=None, groups=None) -> Generator:
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Raises
        ------
        ValueError
            If ``n_splits`` exceeds the number of samples, if ``gap`` is
            negative or too large for the training window, if the test
            window would be empty, or if ``train_size`` and ``test_size``
            leave no position at which a testing origin can be placed.

        Notes
        -----
        This is a generator: the instance attributes (``n_samples``,
        ``mc_origins``, ...) are only updated once iteration starts.
        """
        X, y, groups = indexable(X, y, groups)
        self.n_samples = _num_samples(X)

        self.train_n_samples = int(self.n_samples * self.train_size) - 1
        self.test_n_samples = int(self.n_samples * self.test_size) - 1

        # Make sure we have enough samples for the given split parameters
        if self.n_splits > self.n_samples:
            raise ValueError(
                f'Cannot have number of folds={self.n_splits} greater'
                f' than the number of samples={self.n_samples}.'
            )
        if self.gap < 0:
            # A negative gap would push the end of the training window past
            # the origin, leaking test indices into the training set.
            raise ValueError(f'The gap={self.gap} must be non-negative.')
        if self.train_n_samples - self.gap <= 0:
            raise ValueError(
                f'The gap={self.gap} is too big for number of training samples'
                f'={self.train_n_samples} with testing samples={self.test_n_samples} and gap={self.gap}.'
            )
        if self.test_n_samples <= 0:
            # The test slice below is indices[origin:origin + test_n_samples],
            # which is empty for a non-positive count.
            raise ValueError(
                f'test_size={self.test_size} yields an empty test set for '
                f'number of samples={self.n_samples}.'
            )

        indices = np.arange(self.n_samples)

        # Candidate origins: the lower bound keeps train_start at or above 0,
        # the upper bound keeps the test window inside the series.
        selection_range = np.arange(self.train_n_samples + 1,
                                    self.n_samples - self.test_n_samples - 1)
        if selection_range.size == 0:
            raise ValueError(
                f'No valid testing origin for train_size={self.train_size} and '
                f'test_size={self.test_size} with number of samples={self.n_samples}.'
            )

        # check_random_state(None) returns NumPy's global RandomState, so the
        # default behaves exactly like a plain np.random.choice call.
        rng = check_random_state(self.random_state)
        self.mc_origins = rng.choice(a=selection_range,
                                     size=self.n_splits,
                                     replace=True)

        for origin in self.mc_origins:
            # NOTE(review): with gap > 0 the train window ends at
            # origin - gap + 1, so gap=1 yields the same split as gap=0 and
            # only gap - 1 samples are dropped. Kept for backward
            # compatibility -- confirm whether this is intended.
            if self.gap > 0:
                train_end = origin - self.gap + 1
            else:
                train_end = origin - self.gap

            train_start = origin - self.train_n_samples - 1
            test_end = origin + self.test_n_samples

            yield (
                indices[train_start:train_end],
                indices[origin:test_end],
            )

    def get_origins(self) -> List[int]:
        """Return the testing origins drawn by the most recent ``split``.

        This is an empty list until a ``split`` generator has started
        iterating; afterwards it is the NumPy integer array produced by the
        random draw.
        """
        return self.mc_origins