Commit 2dca1339 authored by David Monllaó's avatar David Monllaó
Browse files

MDL-59988 analytics: Process pending training and prediction files

parent 325b3bdd
......@@ -202,6 +202,61 @@ class dataset_manager {
'/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/', self::EVALUATION_FILENAME);
}
/**
 * Gets the list of files that couldn't be previously used for training and prediction.
 *
 * Scans the model's dataset file areas for each provided time splitting method and
 * returns the files that have not yet been recorded in analytics_used_files as
 * 'trained' (labelled datasets) or 'predicted' (unlabelled datasets). Evaluation
 * files and directory entries are excluded.
 *
 * @param int $modelid The id of the model the files belong to.
 * @param bool $includetarget True to look up labelled (training) files, false for unlabelled (prediction) files.
 * @param string[] $timesplittingids Time splitting method ids whose directories will be scanned.
 * @return array Pending files indexed by time splitting method id.
 */
public static function get_pending_files($modelid, $includetarget, $timesplittingids) {
global $DB;
$fs = get_file_storage();
// Labelled datasets are consumed by training, unlabelled ones by prediction;
// each uses a different file area and a different analytics_used_files action.
if ($includetarget) {
$filearea = self::LABELLED_FILEAREA;
$usedfileaction = 'trained';
} else {
$filearea = self::UNLABELLED_FILEAREA;
$usedfileaction = 'predicted';
}
// Ids of the files this model already consumed for this action.
$select = 'modelid = :modelid AND action = :action';
$params = array('modelid' => $modelid, 'action' => $usedfileaction);
$usedfileids = $DB->get_fieldset_select('analytics_used_files', 'fileid', $select, $params);
// Very likely that we will only have 1 time splitting method here.
$filesbytimesplitting = array();
foreach ($timesplittingids as $timesplittingid) {
$filepath = '/timesplitting/' . self::clean_time_splitting_id($timesplittingid) . '/';
$files = $fs->get_directory_files(\context_system::instance()->id, 'analytics', $filearea, $modelid, $filepath);
foreach ($files as $file) {
// Discard evaluation files.
if ($file->get_filename() === self::EVALUATION_FILENAME) {
continue;
}
// No dirs.
if ($file->is_directory()) {
continue;
}
// Already used for training or prediction.
if (in_array($file->get_id(), $usedfileids)) {
continue;
}
$filesbytimesplitting[$timesplittingid][] = $file;
}
}
return $filesbytimesplitting;
}
/**
* Deletes previous evaluation files of this model.
*
......
......@@ -190,13 +190,13 @@ abstract class base {
list($analysables, $processedanalysables) = $this->get_sorted_analysables($includetarget);
$inittime = time();
foreach ($analysables as $analysable) {
foreach ($analysables as $key => $analysable) {
$files = $this->process_analysable($analysable, $includetarget);
// Later we will need to aggregate data by time splitting method.
foreach ($files as $timesplittingid => $file) {
$filesbytimesplitting[$timesplittingid][$analysable->get_id()] = $file;
$filesbytimesplitting[$timesplittingid][] = $file;
}
$this->update_analysable_analysed_time($processedanalysables, $analysable->get_id(), $includetarget);
......@@ -208,11 +208,35 @@ abstract class base {
break;
}
}
unset($analysables[$key]);
}
if ($this->options['evaluation'] === false) {
// Look for previous training and prediction files we generated and couldn't be used
// by machine learning backends because they weren't big enough.
$pendingfiles = \core_analytics\dataset_manager::get_pending_files($this->modelid, $includetarget,
array_keys($filesbytimesplitting));
foreach ($pendingfiles as $timesplittingid => $files) {
foreach ($files as $file) {
$filesbytimesplitting[$timesplittingid][] = $file;
}
}
}
// We join the datasets by time splitting method.
$timesplittingfiles = $this->merge_analysable_files($filesbytimesplitting, $includetarget);
if (!empty($pendingfiles)) {
// We must remove them now as they are already part of another dataset.
foreach ($pendingfiles as $timesplittingid => $files) {
foreach ($files as $file) {
$file->delete();
}
}
}
return $timesplittingfiles;
}
......
......@@ -693,7 +693,7 @@ class model {
$samplesfile = $samplesdata[$this->model->timesplitting];
// We need to throw an exception if we are trying to predict stuff that was already predicted.
$params = array('modelid' => $this->model->id, 'fileid' => $samplesfile->get_id(), 'action' => 'predicted');
$params = array('modelid' => $this->model->id, 'action' => 'predicted', 'fileid' => $samplesfile->get_id());
if ($predicted = $DB->get_record('analytics_used_files', $params)) {
throw new \moodle_exception('erroralreadypredict', 'analytics', '', $samplesfile->get_id());
}
......
......@@ -34,22 +34,30 @@ defined('MOODLE_INTERNAL') || die();
class dataset_manager_testcase extends advanced_testcase {
/**
* test_create_dataset
* setUp
*
* @return
* @return null
*/
public function test_create_dataset() {
public function setUp() {
$this->resetAfterTest(true);
$sharedtoprows = array(
$this->sharedtoprows = array(
array('var1', 'var2'),
array('value1', 'value2'),
array('header1', 'header2')
);
}
/**
* test_create_dataset
*
* @return null
*/
public function test_create_dataset() {
$dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset1->init_process();
$dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$f1 = $dataset1->store($dataset1data);
$dataset1->close_process();
......@@ -63,26 +71,19 @@ class dataset_manager_testcase extends advanced_testcase {
/**
* test_merge_datasets
*
* @return
* @return null
*/
public function test_merge_datasets() {
$this->resetAfterTest(true);
$sharedtoprows = array(
array('var1', 'var2'),
array('value1', 'value2'),
array('header1', 'header2')
);
$dataset1 = new \core_analytics\dataset_manager(1, 1, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset1->init_process();
$dataset1data = array_merge($sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$dataset1data = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$f1 = $dataset1->store($dataset1data);
$dataset1->close_process();
$dataset2 = new \core_analytics\dataset_manager(1, 2, 'whatever', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$dataset2->init_process();
$dataset2data = array_merge($sharedtoprows, array(array('no', 'no', 'no')));
$dataset2data = array_merge($this->sharedtoprows, array(array('no', 'no', 'no')));
$f2 = $dataset2->store($dataset2data);
$dataset2->close_process();
......@@ -97,4 +98,70 @@ class dataset_manager_testcase extends advanced_testcase {
$this->assertContains('value1', $mergedfilecontents);
$this->assertContains('header1', $mergedfilecontents);
}
/**
 * test_get_pending_files
 *
 * Covers \core_analytics\dataset_manager::get_pending_files: empty result when there
 * are no files, evaluation files being ignored, training and prediction files being
 * reported separately, and files already recorded in analytics_used_files being
 * discarded.
 *
 * @return null
 */
public function test_get_pending_files() {
global $DB;
$this->resetAfterTest();
$fakemodelid = 123;
$timesplittingids = array(
'\core\analytics\time_splitting\quarters',
'\core\analytics\time_splitting\quarters_accum',
);
// No files.
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
// We will reuse this analysable file to create training and prediction datasets (analysable level files are
// merged into training and prediction files).
$analysabledataset = new \core_analytics\dataset_manager($fakemodelid, 1, 'whatever',
\core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$analysabledataset->init_process();
$analysabledatasetdata = array_merge($this->sharedtoprows, array(array('yeah', 'yeah', 'yeah')));
$file = $analysabledataset->store($analysabledatasetdata);
$analysabledataset->close_process();
// Evaluation files ignored (last merge_datasets argument flags the result as an evaluation file).
$evaluationdataset = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
'\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, true);
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids));
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
// Training and prediction files are not mixed up.
$trainingfile1 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
'\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
$trainingfile2 = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
'\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::LABELLED_FILEAREA, false);
// Only the 'quarters' time splitting method has pending training files.
$bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
$this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
$this->assertCount(2, $bytimesplitting['\core\analytics\time_splitting\quarters']);
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
$predictionfile = \core_analytics\dataset_manager::merge_datasets(array($file), $fakemodelid,
'\core\analytics\time_splitting\quarters', \core_analytics\dataset_manager::UNLABELLED_FILEAREA, false);
$bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids);
$this->assertFalse(isset($bytimesplitting['\core\analytics\time_splitting\quarters_accum']));
$this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
// Already used for training and prediction are discarded.
$usedfile = (object)['modelid' => $fakemodelid, 'fileid' => $trainingfile1->get_id(), 'action' => 'trained',
'time' => time()];
$DB->insert_record('analytics_used_files', $usedfile);
$bytimesplitting = \core_analytics\dataset_manager::get_pending_files($fakemodelid, true, $timesplittingids);
$this->assertCount(1, $bytimesplitting['\core\analytics\time_splitting\quarters']);
// Marking the only prediction file as 'predicted' empties the pending prediction list.
$usedfile->fileid = $predictionfile->get_id();
$usedfile->action = 'predicted';
$DB->insert_record('analytics_used_files', $usedfile);
$this->assertEmpty(\core_analytics\dataset_manager::get_pending_files($fakemodelid, false, $timesplittingids));
}
}
......@@ -3687,7 +3687,7 @@
<KEY NAME="fileid" TYPE="foreign" FIELDS="fileid" REFTABLE="files" REFFIELDS="id"/>
</KEYS>
<INDEXES>
<INDEX NAME="modelidandfileidandaction" UNIQUE="false" FIELDS="modelid, fileid, action" COMMENT="Index on modelid and fileid and action"/>
<INDEX NAME="modelidandactionandfileid" UNIQUE="false" FIELDS="modelid, action, fileid" COMMENT="Index on modelid and action and fileid"/>
</INDEXES>
</TABLE>
<TABLE NAME="analytics_indicator_calc" COMMENT="Stored indicator calculations">
......
......@@ -2725,5 +2725,29 @@ function xmldb_main_upgrade($oldversion) {
upgrade_main_savepoint(true, 2017101200.00);
}
// Index modification upgrade step.
if ($oldversion < 2017101300.01) {
$table = new xmldb_table('analytics_used_files');
// Define index modelidandfileidandaction (not unique) to be dropped from analytics_used_files.
$index = new xmldb_index('modelidandfileidandaction', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'fileid', 'action'));
// Conditionally launch drop index modelidandfileidandaction.
if ($dbman->index_exists($table, $index)) {
$dbman->drop_index($table, $index);
}
// Define index modelidandactionandfileid (not unique) to be added to analytics_used_files.
$index = new xmldb_index('modelidandactionandfileid', XMLDB_INDEX_NOTUNIQUE, array('modelid', 'action', 'fileid'));
// Conditionally launch add index modelidandactionandfileid.
if (!$dbman->index_exists($table, $index)) {
$dbman->add_index($table, $index);
}
// Main savepoint reached.
upgrade_main_savepoint(true, 2017101300.01);
}
return true;
}
......@@ -29,7 +29,7 @@
defined('MOODLE_INTERNAL') || die();
$version = 2017101300.00; // YYYYMMDD = weekly release date of this DEV branch.
$version = 2017101300.01; // YYYYMMDD = weekly release date of this DEV branch.
// RR = release increments - 00 in DEV branches.
// .XX = incremental changes.
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment