From 1c38d61ef0471c537ca91d8a1fb6ff1b49f4b17a Mon Sep 17 00:00:00 2001 From: nalenz Date: Mon, 8 Apr 2019 10:48:53 +0200 Subject: [PATCH] Improve style and refactor code for Bayesian model --- README.md | 4 +- bayesean-rerun.sh | 6 +- workers/formatter-bayesean-book/index.js | 157 +++++------ workers/formatter-bayesean-show/index.js | 260 +++++++++--------- workers/postprocessor-bayesean-book/index.js | 46 ++-- workers/postprocessor-bayesean-show/index.js | 60 ++-- workers/predictor-bayesean-book/index.js | 20 -- workers/predictor-bayesean-book/predictor.py | 120 -------- workers/predictor-bayesean-show/index.js | 20 -- workers/predictor-bayesean-show/predictor.py | 127 --------- .../book_predictor_output.json | 2 +- .../predictor-bayesean-book/index.js | 27 ++ .../predictor-bayesean-book/predictor.py | 120 ++++++++ .../predictor-bayesean-show/index.js | 27 ++ .../predictor-bayesean-show/predictor.py | 127 +++++++++ .../show_predictor_output.json | 2 +- workers/predictors-bayesian/requirements.txt | 6 + workers/uploader-attributes-bayesean/index.js | 28 +- .../uploader-predictions-bayesean/index.js | 77 +++--- 19 files changed, 633 insertions(+), 603 deletions(-) delete mode 100644 workers/predictor-bayesean-book/index.js delete mode 100644 workers/predictor-bayesean-book/predictor.py delete mode 100644 workers/predictor-bayesean-show/index.js delete mode 100644 workers/predictor-bayesean-show/predictor.py rename workers/{ => predictors-bayesian}/predictor-bayesean-book/book_predictor_output.json (99%) create mode 100644 workers/predictors-bayesian/predictor-bayesean-book/index.js create mode 100644 workers/predictors-bayesian/predictor-bayesean-book/predictor.py create mode 100644 workers/predictors-bayesian/predictor-bayesean-show/index.js create mode 100644 workers/predictors-bayesian/predictor-bayesean-show/predictor.py rename workers/{ => predictors-bayesian}/predictor-bayesean-show/show_predictor_output.json (99%) create mode 100644 workers/predictors-bayesian/requirements.txt diff --git a/README.md b/README.md index 0d89db3..a956948 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Setup -To run the code in this repository, Node (at least version 10) is needed. Additionally, to run the `predictors-neural` workers, make sure that Python (at least version 3.5) is installed and that you have the dependencies (i.e. run `pip install -r requirements.txt` in that directory). The code in this repository was only tested using Ubuntu 16.04. +To run the code in this repository, Node (at least version 10) is needed. Additionally, to run the predictor workers, make sure that Python (at least version 3.5) is installed and that you have the dependencies (i.e. run `pip3 install -r workers/predictors-bayesian/requirements.txt` and `pip3 install -r workers/predictors-neural/requirements.txt`). The code in this repository was only tested using Ubuntu 16.04. Please run `npm install` after cloning the repository to install all dependencies or when the dependencies changed after pulling. Afterwards, use Visual Studio Code as your IDE to immediately start working with ESLint and Prettier being directly integrated then. @@ -14,7 +14,7 @@ The Bayesean model can be used as follows: 1. If you need to, refetch the data by running `./refetch.sh` in `data/book` and `data/show`. 2. Run `node workers/formatter-bayesean-book` and `node workers/formatter-bayesean-show`. 
They will read out the features used for training from the data and will generate a JSON file in their own directory (`training_book_characters.json` or `training_show_characters.json`).
-3. Run the predictor scripts in `workers/predictor-bayesean-book` and `workers/predictor-bayesean-show`. This can be done directly (`python3 workers/predictor-bayesean-book/predictor.py`) or using Node (`node workers/predictor-bayesean-book`).
+3. Run the predictor scripts in `workers/predictors-bayesian/predictor-bayesean-book` and `workers/predictors-bayesian/predictor-bayesean-show`. This can be done directly (`python3 workers/predictors-bayesian/predictor-bayesean-book/predictor.py`) or using Node (`node workers/predictors-bayesian/predictor-bayesean-book`).
 4. The predictors will produce an output JSON in their own directory (`book_predictor_output.json`, `show_predictor_output.json`). Run the postprocessors to filter out dead characters and the unnecessary data: `node workers/postprocessor-bayesean-book`, `node workers/postprocessor-bayesean-show`.
 5. To upload the predictions to the website, use `node workers/uploader-predictions-bayesean`. To upload only the attributes used and their average influences, use `node workers/uploader-attributes-bayesean`.
diff --git a/bayesean-rerun.sh b/bayesean-rerun.sh
index 033ffca..0486977 100644
--- a/bayesean-rerun.sh
+++ b/bayesean-rerun.sh
@@ -4,8 +4,8 @@ cd ../show
 ./refetch.sh
 node workers/formatter-bayesean-book
 node workers/formatter-bayesean-show
-node workers/predictor-bayesean-book
-node workers/predictor-bayesean-show
+node workers/predictors-bayesian/predictor-bayesean-book
+node workers/predictors-bayesian/predictor-bayesean-show
 node workers/postprocessor-bayesean-book
 node workers/postprocessor-bayesean-show
-node workers/uploader-attributes-bayesean
\ No newline at end of file
+node workers/uploader-attributes-bayesean
diff --git a/workers/formatter-bayesean-book/index.js b/workers/formatter-bayesean-book/index.js
index a0b4fb3..3cafa0d 100644
--- a/workers/formatter-bayesean-book/index.js
+++ b/workers/formatter-bayesean-book/index.js
@@ -6,37 +6,37 @@ We use integers (0 or 1) for the flags, because the model at the end will use
 them like that. */
 /*************************************************************************************************/
-//CONSTANTS
+// CONSTANTS
 const utils = require('../common/utils');
 const config = require('../common/config');
 const fs = require('fs-extra');
 const path = require('path');
-const outfile = path.resolve(__dirname, "training_book_characters.json");
+const outfile = path.resolve(__dirname, 'training_book_characters.json');
-const LOCATION_VISITED_THRESHOLD = 50; //min. amount of people need to visit a location before it's used
-const HOUSE_THRESHOLD = 10; //min. amount of people in this house before it's used
-const CULTURES_THRESHOLD = 10; //min. amount of people with this culture before it's used
-const AGE_THRESHOLD = 100; //alive characters above this age are considered to be errors
+// const LOCATION_VISITED_THRESHOLD = 50; // min. amount of people need to visit a location before it's used
+const HOUSE_THRESHOLD = 10; // min. amount of people in this house before it's used
+const CULTURES_THRESHOLD = 10; // min.
amount of people with this culture before it's used +const AGE_THRESHOLD = 100; // alive characters above this age are considered to be errors /*************************************************************************************************/ -//COLLECTOR FUNCTIONS (will collect data from the data-mined model) +// COLLECTOR FUNCTIONS (will collect data from the data-mined model) function isAlive(character) { return character.alive; } function isSuitableChar(character) { - if(character.alive == undefined || character.alive == null) { + if (character.alive === undefined || character.alive === null) { return false; - } else if(character.alive == false && (character.death == null || character.death == undefined)) { + } else if (character.alive === false && (character.death === null || character.death === undefined)) { return false; - } else if(character.birth == undefined || character.birth == null) { + } else if (character.birth === undefined || character.birth === null) { return false; // character's date of birth is missing } else if (character.alive && config.GOT_CURRENT_YEAR_BOOK - character.birth > AGE_THRESHOLD) { return false; // character has no date of death, but is apparently over AGE_THRESHOLD years old - } else if(character.death < character.birth) { + } else if (character.death < character.birth) { return false; } return true; @@ -44,98 +44,98 @@ function isSuitableChar(character) { function filterChars(unfilteredChars) { let characters = []; - for(let ch of unfilteredChars) { - if(isSuitableChar(ch)) { + for (let ch of unfilteredChars) { + if (isSuitableChar(ch)) { characters.push(ch); } } return characters; } -function collectLocations(charLocations, filteredChars, locMap) { +/*function collectLocations(charLocations, filteredChars, locMap) { let locations_all = []; //all locations we might have flags for for (let c_l of charLocations) { // now check if any new locations will come to the locations array for (let loc of c_l.locations) { - if (locations_all.includes (loc) == false) { + if (locations_all.includes(loc) === false) { //new location is not contained in the array, add it locations_all.push(loc); } } } - + // now, filter locations that have had at least LOCATION_VISITED_THRESHOLD // or more suitable characters visit them let locations = []; - for(let l of locations_all) { + for (let l of locations_all) { loc_counter = 0; - for(let c of filteredChars) { + for (let c of filteredChars) { visited = locMap.get(c.name); - if(visited != undefined && visited.includes(l)) { + if (visited !== undefined && visited.includes(l)) { loc_counter += 1; } } - if(loc_counter >= LOCATION_VISITED_THRESHOLD) { + if (loc_counter >= LOCATION_VISITED_THRESHOLD) { locations.push(l); } } - + return locations; -} +}*/ -function genLocationMap(charLocations) { +/*function genLocationMap(charLocations) { let locKeyValuePairs = []; //map character name => array of visited locations for (let c_l of charLocations) { - // push the name + location array of the character into locKeyValuePairs - locKeyValuePairs.push([c_l.name, c_l.locations]); + // push the name + location array of the character into locKeyValuePairs + locKeyValuePairs.push([c_l.name, c_l.locations]); } //build the Map from the key-value pair array let locMap = new Map(locKeyValuePairs); return locMap; -} +}*/ function collectHouses(unfilteredHouses, filteredChars) { // only consider houses with at least HOUSE_THRESHOLD suitable characters in them let houses = []; - for(let h of unfilteredHouses) { + for (let h of unfilteredHouses) { let 
house_counter = 0; - for(let ch of filteredChars) { - if(ch.house == h.name) { + for (let ch of filteredChars) { + if (ch.house === h.name) { house_counter += 1; } - } - if(house_counter >= HOUSE_THRESHOLD) { - houses.push(h); - } + } + if (house_counter >= HOUSE_THRESHOLD) { + houses.push(h); + } } - + return houses; } function collectCultures(unfilteredCultures, filteredChars) { // only consider cultures with at least CULTURES_THRESHOLD suitable characters in them let cultures = []; - for(let c of unfilteredCultures) { + for (let c of unfilteredCultures) { let culture_counter = 0; - for(let ch of filteredChars) { - if(ch.culture == c.name) { + for (let ch of filteredChars) { + if (ch.culture === c.name) { culture_counter += 1; } } - if(culture_counter >= CULTURES_THRESHOLD) { - cultures.push(c); + if (culture_counter >= CULTURES_THRESHOLD) { + cultures.push(c); } } - + return cultures; } function getMaxRank(characters) { //max pageRank from all characters let max = 0; - for(let ch of characters) { - if(ch.pageRank != null && ch.pagerank != undefined && ch.pageRank.rank > max) { - max = ch.pageRank.rank; + for (let ch of characters) { + if (ch.pageRank !== null && ch.pagerank !== undefined && ch.pagerank.rank > max) { + max = ch.pagerank.rank; } } return max; @@ -161,14 +161,13 @@ function processAge(srcChar, destChar) { function processGender(srcChar, destChar) { // "male" flag = 1 if male if (srcChar.gender !== undefined && srcChar.gender !== null) { - if (srcChar.gender == "male") { + if (srcChar.gender === 'male') { destChar.male = 1; - } - else { + } else { destChar.male = 0; } - } - else { //No gender? + } else { + //No gender? destChar.male = 0; } } @@ -176,7 +175,7 @@ function processGender(srcChar, destChar) { function processHouses(srcChar, destChar, houses) { // for each suitable house, add a flag = 1 if the character is in that house for (let h of houses) { - if (h.name == null || h.name == undefined) continue; + if (h.name === null || h.name === undefined) continue; if (srcChar.house === h.name) { // character IS in this house destChar[h.name] = 1; @@ -185,11 +184,11 @@ function processHouses(srcChar, destChar, houses) { destChar[h.name] = 0; } } - + // also set the house flag to = 1 if the character has pledged allegiance to it if (srcChar.allegiance !== null && srcChar.allegiance !== undefined) { for (let h of srcChar.allegiance) { - if (destChar[h] == 0 || destChar[h] == 1) { + if (destChar[h] === 0 || destChar[h] === 1) { destChar[h] = 1; } } @@ -208,7 +207,7 @@ function processCultures(srcChar, destChar, cultures) { } function processTitles(srcChar, destChar) { - if(srcChar["titles"] != undefined && srcChar["titles"].length > 0) { + if (srcChar['titles'] !== undefined && srcChar['titles'].length > 0) { destChar.hasTitles = 1; } else { destChar.hasTitles = 0; @@ -218,7 +217,7 @@ function processTitles(srcChar, destChar) { function processSpouses(srcChar, destChar, characters) { // is the character married? 
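  // A minimal sketch for the symmetric case flagged in the TODO just below
  // (illustrative only, not part of this patch; assumes spouse entries are
  // names or arrays of names, and `isListedAsSpouse` is a hypothetical helper):
  //   const isListedAsSpouse = name =>
  //     characters.some(c => (Array.isArray(c.spouse) ? c.spouse.includes(name) : c.spouse === name));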
// TODO cover the case where somebody else has srcChar as a spouse, but not vice versa - if (srcChar["spouse"] != undefined && srcChar["spouse"] != null && srcChar["spouse"].length > 0) { + if (srcChar['spouse'] !== undefined && srcChar['spouse'] !== null && srcChar['spouse'].length > 0) { destChar.isMarried = 1; //destChar.hasDeadSpouse = 0; //determine whether character has a dead spouse @@ -236,7 +235,7 @@ function processSpouses(srcChar, destChar, characters) { } else { //spouse is not an array, so only one spouse for(let ch of characters) { - if(ch.name == srcChar["spouse"]) { + if(ch.name === srcChar["spouse"]) { if(!isAlive(ch)) { //the spouse is dead destChar.hasDeadSpouse = 1; @@ -245,7 +244,8 @@ function processSpouses(srcChar, destChar, characters) { } } }*/ - } else { //no spouses + } else { + //no spouses destChar.isMarried = 0; //destChar.hasDeadSpouse = 0; } @@ -262,9 +262,9 @@ function processLocations(srcChar, destChar, locations, locMap) { if (visited !== null && visited !== undefined) { // set the flag to 1 for all locations in the visited array for (let loc of visited) { - if(locations.includes(loc)) { + if (locations.includes(loc)) { destChar[loc] = 1; - } + } } } } @@ -272,7 +272,7 @@ function processLocations(srcChar, destChar, locations, locMap) { function processParents(srcChar, destChar, characters) { //first: Is srcChar somebody's parent? destChar.hasChildren = 0; - if(srcChar.children != null && srcChar.children != undefined && srcChar.children.length > 0) { + if (srcChar.children !== null && srcChar.children !== undefined && srcChar.children.length > 0) { destChar.hasChildren = 1; } } @@ -280,8 +280,8 @@ function processParents(srcChar, destChar, characters) { function processHeir(srcChar, destChar, characters) { //is srcChar somebody's heir? 
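  // A hedged performance aside (sketch only, not part of this patch): the loop
  // below scans all characters once per formatted character, so the whole pass
  // is O(n^2); precomputing the heir names once would make each lookup O(1):
  //   const heirNames = new Set(characters.map(c => c.heir).filter(Boolean));
  //   destChar.isHeir = heirNames.has(srcChar.name) ? 1 : 0;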
destChar.isHeir = 0; - for(let ch of characters) { - if(ch.heir == srcChar.name) { + for (let ch of characters) { + if (ch.heir === srcChar.name) { destChar.isHeir = 1; break; } @@ -289,7 +289,8 @@ function processHeir(srcChar, destChar, characters) { } function processRank(srcChar, destChar, maxRank) { - if(srcChar.pageRank != null && srcChar.pageRank != undefined && srcChar.pageRank.rank > (0.34 * maxRank)) { //this check is similar to the 2016 project + if (srcChar.pageRank !== null && srcChar.pageRank !== undefined && srcChar.pageRank.rank > 0.34 * maxRank) { + //this check is similar to the 2016 project destChar.isMajor = 1; } else { destChar.isMajor = 0; @@ -298,7 +299,7 @@ function processRank(srcChar, destChar, maxRank) { /*************************************************************************************************/ -async function genTrainingData (callback) { +async function genTrainingData(callback) { // read the needed JSON files let [characters_unfiltered, houses_unfiltered, cultures_unfiltered, character_locations] = await Promise.all([ utils.loadBookData('characters'), @@ -306,14 +307,14 @@ async function genTrainingData (callback) { utils.loadBookData('cultures'), utils.loadBookData('characterLocations'), ]); - + let characters = filterChars(characters_unfiltered); // filter out unsuitable characters //let locMap = genLocationMap(character_locations); // generate a character-to-locations map //let locations = collectLocations(character_locations, characters, locMap); // collect locations and filter them let houses = collectHouses(houses_unfiltered, characters); // collect houses and filter them let cultures = collectCultures(cultures_unfiltered, characters); // collect cultures and filter them let maxRank = getMaxRank(characters); //max pageRank can determine who is a major character - + // in training_chars, we will accumulate the character data used for training let training_chars = []; @@ -321,21 +322,21 @@ async function genTrainingData (callback) { for (let ch of characters) { // this will be the reformatted character let ref_ch = {}; - - ref_ch.name = ch.name; // copy the name - processAge(ch, ref_ch); // process age-related data - processGender(ch, ref_ch); // process gender data - processHouses(ch, ref_ch, houses); // process house data - processCultures(ch, ref_ch, cultures); // process culture data - processTitles(ch, ref_ch); // process titles data - processSpouses(ch, ref_ch, characters_unfiltered); // process spouses data - //processLocations(ch, ref_ch, locations, locMap); // process location data + + ref_ch.name = ch.name; // copy the name + processAge(ch, ref_ch); // process age-related data + processGender(ch, ref_ch); // process gender data + processHouses(ch, ref_ch, houses); // process house data + processCultures(ch, ref_ch, cultures); // process culture data + processTitles(ch, ref_ch); // process titles data + processSpouses(ch, ref_ch, characters_unfiltered); // process spouses data + //processLocations(ch, ref_ch, locations, locMap); // process location data processParents(ch, ref_ch, characters_unfiltered); processHeir(ch, ref_ch, characters_unfiltered); //processRank(ch, ref_ch, maxRank); //TODO books the character was in //TODO consider dead parents/heirs/spouses somehow - + // push the reformatted character and move on to the next one training_chars.push(ref_ch); } @@ -343,8 +344,8 @@ async function genTrainingData (callback) { // output ready // Wanted some more readable JSON here :) let readableJSON = JSON.stringify(training_chars, null, 
2); - fs.writeFile(outfile, readableJSON, (err) => { - if(err) throw err; + fs.writeFile(outfile, readableJSON, err => { + if (err) throw err; //signal async completion callback(); }); @@ -353,4 +354,6 @@ async function genTrainingData (callback) { exports.formatBookData = genTrainingData; //call the function -genTrainingData(() => {console.log("Formatting book characters complete!");}); \ No newline at end of file +genTrainingData(() => { + console.log('Formatting book characters complete!'); +}); diff --git a/workers/formatter-bayesean-show/index.js b/workers/formatter-bayesean-show/index.js index c9b5de5..6e20ac6 100644 --- a/workers/formatter-bayesean-show/index.js +++ b/workers/formatter-bayesean-show/index.js @@ -13,32 +13,32 @@ const config = require('../common/config'); const fs = require('fs-extra'); const path = require('path'); -const outfile = path.resolve(__dirname, "training_show_characters.json"); +const outfile = path.resolve(__dirname, 'training_show_characters.json'); -const LOCATION_VISITED_THRESHOLD = 50; //min. amount of people need to visit a location before it's used +//const LOCATION_VISITED_THRESHOLD = 50; //min. amount of people need to visit a location before it's used const HOUSE_THRESHOLD = 5; //min. amount of people in this house before it's used const CULTURES_THRESHOLD = 10; //min. amount of people with this culture before it's used /*************************************************************************************************/ -//COLLECTOR FUNCTIONS (will collect data from the data-mined model) +// COLLECTOR FUNCTIONS (will collect data from the data-mined model) function isSuitableChar(character) { - if(character.alive == undefined || character.alive == null) { - return false; - } else if ((character.death == null || character.death == undefined) && character.alive == false) { + if (character.alive === undefined || character.alive === null) { + return false; + } else if ((character.death === null || character.death === undefined) && character.alive === false) { return false; // character is dead, but has no date of death } else if (character.death > config.GOT_SHOW_BEGIN + 6) { - return false; //character apparently died after season 7? + return false; // character apparently died after season 7? } return true; } function filterChars(unfilteredChars) { let characters = []; - for(let ch of unfilteredChars) { - if(isSuitableChar(ch)) { + for (let ch of unfilteredChars) { + if (isSuitableChar(ch)) { characters.push(ch); - } + } } return characters; } @@ -46,66 +46,67 @@ function filterChars(unfilteredChars) { function collectHouses(filteredChars) { // first, collect all the houses from the characters' tags let unfilteredHouses = []; - for(let ch of filteredChars) { - for(let h of ch.allegiances) { - if(h != undefined && h != null && !(unfilteredHouses.includes(h))) { - unfilteredHouses.push(h); - } - } + for (let ch of filteredChars) { + for (let h of ch.allegiances) { + if (h !== undefined && h !== null && !unfilteredHouses.includes(h)) { + unfilteredHouses.push(h); + } + } } - + // only consider houses with at least HOUSE_THRESHOLD suitable characters in them let houses = []; - for(let h of unfilteredHouses) { + for (let h of unfilteredHouses) { let house_counter = 0; - for(let ch of filteredChars) { - if(ch.allegiances.includes(h)) { + for (let ch of filteredChars) { + if (ch.allegiances.includes(h)) { house_counter += 1; - } - } - if(house_counter >= HOUSE_THRESHOLD && h.includes("House")) { //why is the Night's Watch considered a house anyway? 
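    // Illustrative note on the substring test above: "House Stark".includes('House')
    // is true, while "Night's Watch".includes('House') is false, so allegiance
    // tags that are not actual houses are dropped at this threshold check.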
+ } + } + if (house_counter >= HOUSE_THRESHOLD && h.includes('House')) { + // why is the Night's Watch considered a house anyway? houses.push(h); - } + } } - + return houses; } function collectCultures(filteredChars) { // collect all cultures from the character data let unfilteredCultures = []; - for(let ch of filteredChars) { - if(ch.cultures == undefined || ch.cultures == null) { - continue; - } - for(let cult of ch.cultures) { - if(!(unfilteredCultures.includes(cult))) { - unfilteredCultures.push(cult); - } - } + for (let ch of filteredChars) { + if (ch.cultures === undefined || ch.cultures === null) { + continue; + } + for (let cult of ch.cultures) { + if (!unfilteredCultures.includes(cult)) { + unfilteredCultures.push(cult); + } + } } - + // only consider cultures with at least CULTURES_THRESHOLD suitable characters in them let cultures = []; - for(let c of unfilteredCultures) { + for (let c of unfilteredCultures) { let culture_counter = 0; - for(let ch of filteredChars) { - if(ch.culture == c) { + for (let ch of filteredChars) { + if (ch.culture === c) { culture_counter += 1; - } - } - if(culture_counter >= CULTURES_THRESHOLD) { + } + } + if (culture_counter >= CULTURES_THRESHOLD) { cultures.push(c); - } + } } - + return cultures; } function getMaxPagerank(characters) { let max_rank = 0; - for(let ch of characters) { - if(ch.pagerank != null && ch.pagerank != undefined && ch.pagerank.rank > max_rank) { + for (let ch of characters) { + if (ch.pagerank !== null && ch.pagerank !== undefined && ch.pagerank.rank > max_rank) { max_rank = ch.pagerank.rank; } } @@ -113,14 +114,14 @@ function getMaxPagerank(characters) { } /*************************************************************************************************/ -//FORMATTER FUNCTIONS (will use the collected data to add flags to a reformatted character model) -//They do this as a side effect and do not return anything. +// FORMATTER FUNCTIONS (will use the collected data to add flags to a reformatted character model) +// They do this as a side effect and do not return anything. function processAge(srcChar, destChar) { // use absolute time, since birth dates are generally unavailable // check whether character alive or not and calculate "age", i.e. how long in the show they lived destChar.isDead = !srcChar.alive ? 1 : 0; - + if (!srcChar.alive) { // dead destChar.livedTo = srcChar.death - config.GOT_SHOW_BEGIN; @@ -133,46 +134,45 @@ function processAge(srcChar, destChar) { function processGender(srcChar, destChar) { // "male" flag = 1 if male if (srcChar.gender !== undefined && srcChar.gender !== null) { - if (srcChar.gender == "male") { - destChar.male = 1; - } - else { - destChar.male = 0; - } - } - else { //No gender? + if (srcChar.gender === 'male') { + destChar.male = 1; + } else { + destChar.male = 0; + } + } else { + //No gender? 
destChar.male = 0; } } function processHouses(srcChar, destChar, houses) { // for each suitable house, add a flag = 1 if the character is in that house - numHouses = 0; - + // let numHouses = 0; + for (let h of houses) { if (srcChar.house === h) { // character IS in this house - numHouses += 1; + // numHouses += 1; destChar[h] = 1; } else { // character is NOT in this house destChar[h] = 0; } } - + // also set the house flag to = 1 if the character has pledged allegiance to it if (srcChar.allegiances !== null && srcChar.allegiances !== undefined) { for (let h of srcChar.allegiances) { - if(houses.includes(h)) { - if(destChar[h] == 0) { + if (houses.includes(h)) { + /*if (destChar[h] === 0) { numHouses += 1; - } - destChar[h] = 1; - } - } + }*/ + destChar[h] = 1; + } + } } - - //destChar["multipleHouses"] = numHouses > 1 ? 1 : 0; //allegiance to multiple houses indicates changing allegiances. Weak predictor. + + //destChar["multipleHouses"] = numHouses > 1 ? 1 : 0; // allegiance to multiple houses indicates changing allegiances. Weak predictor. } function processCultures(srcChar, destChar, cultures) { @@ -187,8 +187,8 @@ function processCultures(srcChar, destChar, cultures) { } function processTitles(srcChar, destChar) { - if(srcChar["titles"] != undefined && srcChar["titles"].length != undefined) { - destChar.hasTitles = srcChar["titles"].length > 0 ? 1 : 0 + if (srcChar['titles'] !== undefined && srcChar['titles'].length !== undefined) { + destChar.hasTitles = srcChar['titles'].length > 0 ? 1 : 0; } else { destChar.hasTitles = 0; } @@ -196,7 +196,7 @@ function processTitles(srcChar, destChar) { function processSpouses(srcChar, destChar) { // whether the character is married - if (srcChar["spouse"] != undefined && srcChar["spouse"].length != undefined) { + if (srcChar['spouse'] !== undefined && srcChar['spouse'].length !== undefined) { destChar.isMarried = 1; } else { destChar.isMarried = 0; @@ -204,36 +204,34 @@ function processSpouses(srcChar, destChar) { } function processLovers(srcChar, destChar) { - if(srcChar["lovers"] != undefined && srcChar["lovers"] != null && srcChar["lovers"].length > 0) { + if (srcChar['lovers'] !== undefined && srcChar['lovers'] !== null && srcChar['lovers'].length > 0) { destChar.hasLovers = 1; - } - else { + } else { destChar.hasLovers = 0; } } function processSiblings(srcChar, destChar) { - if(srcChar["siblings"] != undefined && srcChar["siblings"] != null && srcChar["siblings"].length > 0) { + if (srcChar['siblings'] !== undefined && srcChar['siblings'] !== null && srcChar['siblings'].length > 0) { destChar.hasSiblings = 1; - } - else { + } else { destChar.hasSiblings = 0; } } function processParent(srcChar, destChar, characters) { destChar.isParent = 0; - for(let ch of characters) { - if(ch.name == srcChar["mother"] || ch.name == srcChar["mother"]) { + for (let ch of characters) { + if (ch.name === srcChar['mother'] || ch.name === srcChar['mother']) { destChar.isParent = 1; } } } function processPagerank(srcChar, destChar, maxRank) { - if(srcChar.pagerank != null && srcChar.pagerank != undefined && srcChar.pagerank.rank >= 0.34 * maxRank) { + if (srcChar.pagerank != null && srcChar.pagerank !== undefined && srcChar.pagerank.rank >= 0.34 * maxRank) { destChar.isMajor = 1; - } else{ + } else { destChar.isMajor = 0; } } @@ -244,55 +242,55 @@ function processDeadRelations(srcChar, destChar, unfilteredChars) { destChar.isFatherDead = 0; destChar.isSpouseDead = 0; destChar.hasDeadLovers = 0; - //destChar.hasDeadChild = 0; //TODO do this - - //siblings - 
if(srcChar.siblings != undefined && srcChar.siblings != null) { - outloop_siblings: for(let sibling of srcChar.siblings) { - for(let ch of unfilteredChars) { - if(ch.name == sibling && ch.alive == false) { + // destChar.hasDeadChild = 0; // TODO do this + + // siblings + if (srcChar.siblings !== undefined && srcChar.siblings !== null) { + outloop_siblings: for (let sibling of srcChar.siblings) { + for (let ch of unfilteredChars) { + if (ch.name === sibling && ch.alive === false) { destChar.hasDeadSiblings = 1; break outloop_siblings; } } } } - - //father - if(srcChar.father != undefined && srcChar.father != null) { - for(let ch of unfilteredChars) { - if(ch.name == srcChar.father && ch.alive == false) { + + // father + if (srcChar.father !== undefined && srcChar.father !== null) { + for (let ch of unfilteredChars) { + if (ch.name === srcChar.father && ch.alive === false) { destChar.isFatherDead = 1; break; } } } - - //mother - if(srcChar.mother != undefined && srcChar.mother != null) { - for(let ch of unfilteredChars) { - if(ch.name == srcChar.mother && ch.alive == false) { + + // mother + if (srcChar.mother !== undefined && srcChar.mother !== null) { + for (let ch of unfilteredChars) { + if (ch.name === srcChar.mother && ch.alive === false) { destChar.isMotherDead = 1; break; } } } - - //spouse - if(srcChar.spouse != undefined && srcChar.spouse != null) { - for(let ch of unfilteredChars) { - if(ch.name == srcChar.spouse && ch.alive == false) { + + // spouse + if (srcChar.spouse !== undefined && srcChar.spouse !== null) { + for (let ch of unfilteredChars) { + if (ch.name === srcChar.spouse && ch.alive === false) { destChar.isSpouseDead = 1; break; } } } - - //lovers - if(srcChar.lovers != undefined && srcChar.lovers != null) { - outloop_lovers: for(let lover of srcChar.lovers) { - for(let ch of unfilteredChars) { - if(ch.name == lover && ch.alive == false) { + + // lovers + if (srcChar.lovers !== undefined && srcChar.lovers !== null) { + outloop_lovers: for (let lover of srcChar.lovers) { + for (let ch of unfilteredChars) { + if (ch.name === lover && ch.alive === false) { destChar.hasDeadLovers = 1; break outloop_lovers; } @@ -304,7 +302,7 @@ function processDeadRelations(srcChar, destChar, unfilteredChars) { function processBastards(srcChar, destChar, bastards) { destChar.isBastard = 0; for (let b of bastards) { - if (b.name == srcChar.name) { + if (b.name === srcChar.name) { destChar.isBastard = 1; break; } @@ -313,13 +311,10 @@ function processBastards(srcChar, destChar, bastards) { /*************************************************************************************************/ -async function genTrainingData (callback) { +async function genTrainingData(callback) { // read the needed JSON files - let [characters_unfiltered, bastards] = await Promise.all([ - utils.loadShowData('characters'), - utils.loadShowData('bastards'), - ]); - + let [characters_unfiltered, bastards] = await Promise.all([utils.loadShowData('characters'), utils.loadShowData('bastards')]); + let characters = filterChars(characters_unfiltered); // filter out unsuitable characters let houses = collectHouses(characters); // collect houses and filter them let cultures = collectCultures(characters); // collect cultures and filter them @@ -332,21 +327,21 @@ async function genTrainingData (callback) { for (let ch of characters) { // this will be the reformatted character let ref_ch = {}; - - ref_ch.name = ch.name; // copy the name - processAge(ch, ref_ch); // process age-related data - processGender(ch, ref_ch); // 
process gender data - processHouses(ch, ref_ch, houses); // process house data - processCultures(ch, ref_ch, cultures); // process culture data - processTitles(ch, ref_ch); // process titles data - processSpouses(ch, ref_ch); // process spouses data + + ref_ch.name = ch.name; // copy the name + processAge(ch, ref_ch); // process age-related data + processGender(ch, ref_ch); // process gender data + processHouses(ch, ref_ch, houses); // process house data + processCultures(ch, ref_ch, cultures); // process culture data + processTitles(ch, ref_ch); // process titles data + processSpouses(ch, ref_ch); // process spouses data processLovers(ch, ref_ch); - //processSiblings(ch, ref_ch); //not influential - //processParent(ch, ref_ch, characters); //not influential + // processSiblings(ch, ref_ch); //not influential + // processParent(ch, ref_ch, characters); //not influential processPagerank(ch, ref_ch, maxRank); processDeadRelations(ch, ref_ch, characters_unfiltered); processBastards(ch, ref_ch, bastards); - + // push the reformatted character and move on to the next one training_chars.push(ref_ch); } @@ -354,14 +349,15 @@ async function genTrainingData (callback) { // output ready // Wanted some more readable JSON here :) let readableJSON = JSON.stringify(training_chars, null, 2); - fs.writeFile(outfile, readableJSON, (err) => { - if(err) throw err; + fs.writeFile(outfile, readableJSON, err => { + if (err) throw err; callback(); }); - } exports.formatShowData = genTrainingData; -//call the function -genTrainingData(() => {console.log("Formatting show characters complete!");}); \ No newline at end of file +// call the function +genTrainingData(() => { + console.log('Formatting show characters complete!'); +}); diff --git a/workers/postprocessor-bayesean-book/index.js b/workers/postprocessor-bayesean-book/index.js index 3bce462..639c236 100644 --- a/workers/postprocessor-bayesean-book/index.js +++ b/workers/postprocessor-bayesean-book/index.js @@ -1,22 +1,21 @@ -'use strict' +'use strict'; const fs = require('fs'); const path = require('path'); -const outfile = path.resolve(__dirname, "book_predictions.json"); -const infile = path.resolve(__dirname, "../predictor-bayesean-book/book_predictor_output.json"); +const outfile = path.resolve(__dirname, 'book_predictions.json'); +const infile = path.resolve(__dirname, '../predictors-bayesian/predictor-bayesean-book/book_predictor_output.json'); function reformatOutput(predictionObject, callback) { - //onlyAlive will store alive, filtered characters let onlyAlive = {}; onlyAlive.attributes = predictionObject.attributes; onlyAlive.meanBetaExp = predictionObject.meanBetaExp; onlyAlive.characters = {}; - - for(let c of predictionObject.characters) { - if(c.alive == false) continue; - + + for (let c of predictionObject.characters) { + if (c.alive === false) continue; + let newChar = {}; newChar.age = c.age; //newChar.predictedSurvivalAge = c.predictedSurvivalAge; @@ -24,26 +23,31 @@ function reformatOutput(predictionObject, callback) { //newChar.confIntervalHigher = c.confIntervalHigher; //newChar.confIntervalConfidence = c.confIntervalConfidence; newChar.survivalFunctionMean = c.survivalFunctionMean; - + onlyAlive.characters[c.name] = newChar; } - + //onlyAlive is now ready, write it to a JSON //transformer function will reduce precision, since it's not really needed - let json = JSON.stringify(onlyAlive, function(key, val) { - if(val.toPrecision) return +val.toPrecision(3); - else return val; - }, 2); - - fs.writeFile(outfile, json, function(err) { 
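    // Illustrative note on the replacer above: +(0.123456).toPrecision(3)
    // evaluates to 0.123, so every numeric value in the written JSON is
    // rounded to three significant digits.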
//'../outputs-bayesean/processedOutputBook.json' - if(err) throw err; + let json = JSON.stringify( + onlyAlive, + function(key, val) { + if (val.toPrecision) return +val.toPrecision(3); + else return val; + }, + 2, + ); + + fs.writeFile(outfile, json, function(err) { + //'../outputs-bayesean/processedOutputBook.json' + if (err) throw err; callback(); }); } function reformat(callback) { - fs.readFile(infile, function (err, data) { - if(err) throw err; + fs.readFile(infile, function(err, data) { + if (err) throw err; let predictionObject = JSON.parse(data); reformatOutput(predictionObject, callback); }); @@ -51,4 +55,6 @@ function reformat(callback) { exports.reformatBookOutput = reformat; -reformat(() => {console.log("Postprocessing book predictions complete!");}); \ No newline at end of file +reformat(() => { + console.log('Postprocessing book predictions complete!'); +}); diff --git a/workers/postprocessor-bayesean-show/index.js b/workers/postprocessor-bayesean-show/index.js index 2dd587f..112ef74 100644 --- a/workers/postprocessor-bayesean-show/index.js +++ b/workers/postprocessor-bayesean-show/index.js @@ -1,49 +1,53 @@ -'use strict' +'use strict'; const fs = require('fs'); const path = require('path'); -const outfile = path.resolve(__dirname, "show_predictions.json"); -const infile = path.resolve(__dirname, "../predictor-bayesean-show/show_predictor_output.json"); +const outfile = path.resolve(__dirname, 'show_predictions.json'); +const infile = path.resolve(__dirname, '../predictors-bayesian/predictor-bayesean-show/show_predictor_output.json'); function reformatOutput(predictionObject, callback) { - - //onlyAlive will store alive, filtered characters + // onlyAlive will store alive, filtered characters let onlyAlive = {}; onlyAlive.attributes = predictionObject.attributes; onlyAlive.meanBetaExp = predictionObject.meanBetaExp; onlyAlive.characters = {}; - - for(let c of predictionObject.characters) { - if(c.alive == false) continue; - + + for (let c of predictionObject.characters) { + if (c.alive === false) continue; + let newChar = {}; newChar.livedTo = c.livedTo; - //newChar.predictedSurvivalAge = c.predictedSurvivalAge; - //newChar.confIntervalLower = c.confIntervalLower; - //newChar.confIntervalHigher = c.confIntervalHigher; - //newChar.confIntervalConfidence = c.confIntervalConfidence; + // newChar.predictedSurvivalAge = c.predictedSurvivalAge; + // newChar.confIntervalLower = c.confIntervalLower; + // newChar.confIntervalHigher = c.confIntervalHigher; + // newChar.confIntervalConfidence = c.confIntervalConfidence; newChar.survivalFunctionMean = c.survivalFunctionMean; - + onlyAlive.characters[c.name] = newChar; } - - //onlyAlive is now ready, write it to a JSON - //transformer function will reduce precision, since it's not really needed - let json = JSON.stringify(onlyAlive, function(key, val) { - if(val.toPrecision) return +val.toPrecision(3); - else return val; - }, 2); - - fs.writeFile(outfile, json, function(err) { //'../outputs-bayesean/processedOutputBook.json' - if(err) throw err; + + // onlyAlive is now ready, write it to a JSON + // transformer function will reduce precision, since it's not really needed + let json = JSON.stringify( + onlyAlive, + function(key, val) { + if (val.toPrecision) return +val.toPrecision(3); + else return val; + }, + 2, + ); + + fs.writeFile(outfile, json, function(err) { + // '../outputs-bayesean/processedOutputBook.json' + if (err) throw err; callback(); }); } function reformat(callback) { - fs.readFile(infile, function (err, data) { - 
if(err) throw err; + fs.readFile(infile, function(err, data) { + if (err) throw err; let predictionObject = JSON.parse(data); reformatOutput(predictionObject, callback); }); @@ -51,4 +55,6 @@ function reformat(callback) { exports.reformatShowOutput = reformat; -reformat(() => {console.log("Postprocessing show predictions complete!");}); \ No newline at end of file +reformat(() => { + console.log('Postprocessing show predictions complete!'); +}); diff --git a/workers/predictor-bayesean-book/index.js b/workers/predictor-bayesean-book/index.js deleted file mode 100644 index d85ff3c..0000000 --- a/workers/predictor-bayesean-book/index.js +++ /dev/null @@ -1,20 +0,0 @@ -'use strict'; - -const {exec} = require('child_process'); -const path = require('path'); - -function runBookPredictor(callback) { - exec("python3 predictor.py", { - "cwd": path.resolve(__dirname) - }, - (error, stdout, stderr) => { - if(error) {throw error;} - console.log(stdout); - console.log(stderr); - callback(); - }); -} - -exports.runBookPredictor = runBookPredictor; - -runBookPredictor(() => {console.log("Prediction complete!");}); \ No newline at end of file diff --git a/workers/predictor-bayesean-book/predictor.py b/workers/predictor-bayesean-book/predictor.py deleted file mode 100644 index a9bf668..0000000 --- a/workers/predictor-bayesean-book/predictor.py +++ /dev/null @@ -1,120 +0,0 @@ -from matplotlib import pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns -import pymc3 as pm -from pymc3.distributions.timeseries import GaussianRandomWalk -from theano import tensor as T -import random -import json -import math -import sys - -infile = "../formatter-bayesean-book/training_book_characters.json" -outfile = "./book_predictor_output.json" - -#read input file -df = pd.read_json(path_or_buf = infile, typ = "frame") - -#get some parameters -num_characters = df.shape[0] -characters = np.arange(num_characters) #vector 1....num_characters - -#set parameters -interval_length = 1 #discretization over interval_length-year intervals -interval_bounds = np.arange (0, df.age.max() + interval_length + 1, interval_length) #vector describing the boundaries of the intervals -n_intervals = interval_bounds.size - 1 #number of intervals, given max age -intervals = np.arange(n_intervals) #indexes of intervals in a vector - -#determine death matrix and exposure matrix -last_period = np.floor((df.age - 0.01) / interval_length).astype(int) #last period where a character was observed - -death = np.zeros((num_characters, n_intervals)) #matrix rows = chars, cols = intervals, cell = 1 if character died in this interval -death[characters, last_period]=df.isDead - -exposure = np.greater_equal.outer(df.age, interval_bounds[:-1])*interval_length #matrix rows=chars, cols=intervals, cell = #years character was exposed to risk in this interval -exposure[characters, last_period] = df.age - interval_bounds[last_period] -exposure=exposure.astype(np.float) #keep it as a float for calculation purposes - -#too many zeroes in the exposure matrix apparently cause a lot of problems, so just replace them with sth very small -filter_func = np.vectorize(lambda v: 1e-200 if v<=0 else v) #assuming a tiny chance of dying after you're dead isn't so bad, is it? 
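# Background note (hedged): this death/exposure construction is the standard
# piecewise-exponential "Poisson trick" for survival regression: modelling the
# per-character, per-interval death indicators as Poisson(exposure * lambda)
# gives the same likelihood as a proportional-hazards model with a
# piecewise-constant baseline hazard lambda0, which is what the pm.Model block
# below builds.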
-exposure = filter_func(exposure) - -#convert the DataFrame into a numPy array (also exclude columns we don't want to have as training parameters) -df_dropped = df.drop(["age", "isDead", "name"], axis=1) -colNames = df_dropped.columns.values.tolist() #will use later when writing the prediction file -df_num=df_dropped.to_numpy().astype(float) #characters=rows, attributes=cols -num_parameters = df_num.shape[1]; - -SEED = random.randint(1,10000000) #will be used in the sampler -#create the model -with pm.Model() as model: - lambda0 = pm.Gamma('lambda0', mu=0.02, sd=0.02, shape=n_intervals) #this is a vector (base risk to die in a time slice) - beta = pm.Normal('beta', mu=0, sd=1000, shape=num_parameters) #this is a vector (one coefficient per covariate) - lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(T.dot(df_num, beta)), lambda0)) #this is a matrix (risk of character(row) in a time slice(col)) - mu = pm.Deterministic('mu', exposure*lambda_) #this is also a matrix (risk = 0 if character already dead, otherwise same as lambda_) - obs = pm.Poisson('obs', mu, observed=death) - -n_samples = 100 #both should be 1000, 100 for quick testing -n_tune = 100 -acceptance_probability = 0.9 -num_chains = 2 -#now, sample the model -with model: - trace = pm.sample(n_samples, tune = n_tune, random_seed=SEED, chains = num_chains, nuts_kwargs = dict(target_accept=acceptance_probability)) - -# trace = samples for our trained, posterior distribution -# trace['beta'] is a matrix. Rows = all the samples, colums = sampled beta vector -# trace['lambda'] is a matrix, rows = all the samples, cols = sampled chance to die in a given time slice - -def get_dotprodfactors(params): #get the hazard multipliers (not yet exponentiated) for each step of the trace, depending on the parameters - return trace['beta'].dot(np.atleast_2d(params).transpose()) - -def calc_hazard(dotprodfactors): #calculates hazard values for each time slice up to, but not including, time, dependent on the params - return trace['lambda0']*np.exp(dotprodfactors) - -def cum_hazard(hazard): #given hazard-per-timeslice values, calculate cumulative hazard - return (interval_length*hazard).cumsum(axis=-1) - -def survival(hazard): #describes likelihood of surviving the cumulative hazard - return np.exp(-cum_hazard(hazard)) - -def survivalParams(params): #describes survival function distribution (i.e. a set of samples, each being a survival function), given some params - return survival(calc_hazard(get_dotprodfactors(params))) - -def fitAge_greater_equal(survFn, greaterThan): #how many years are equally or more probable than greaterThan? 
- fits = np.greater_equal(survFn, greaterThan).astype(int).sum(axis=1)*interval_length - return fits - -def fitAge_greater_equal_mean(survFn, greaterThan): - return fitAge_greater_equal(survFn, greaterThan).mean() - -def fitAge_greater_equal_last(survFn, greaterThan): - return fitAge_greater_equal(survFn, greaterThan)[-1] - -#Now construct the output file -predictions = {} #we'll write this dict to a JSON -#predictions["priorHazard"] = trace['lambda0'].mean(axis=0).astype(float).tolist() -predictions["attributes"] = colNames -beta = trace['beta'] # make a mean of all rows in the entire trace, transform the column matrix into a (single-) row matrix and get the row out -#predictions["betaExp"] = np.exp(beta).astype(float).tolist() -predictions["meanBetaExp"] = np.exp(beta.mean(axis=0)).astype(float).tolist() -predictions["characters"] = [] -#now add the survial function for every character -for i in range(0, num_characters): - ch = {} #this dict will represent the character's survival function - ch["name"] = df["name"][i] - ch["alive"] = False if df["isDead"][i] > 0 else True - ch["age"] = df["age"].astype(float)[i] - survFn= survivalParams(df_num[i, :]).astype(float) #take the i-th row of df_num for the character's parameters - #ch["predictedSurvivalAge"] = fitAge_greater_equal(survFn, 0.5).astype(float).tolist() - confidence = 0.8 - #ch["confIntervalLower"] = fitAge_greater_equal(survFn, confidence).astype(float).tolist() - #ch["confIntervalHigher"] = fitAge_greater_equal(survFn, 1-confidence).astype(float).tolist() - #ch["confIntervalConfidence"] = confidence - ch["survivalFunctionMean"] = survFn.mean(axis=0).tolist() - predictions["characters"].append(ch) - -#now write the predictions object to a file -output = open(outfile, 'w') -json.dump(predictions, output, indent=2) \ No newline at end of file diff --git a/workers/predictor-bayesean-show/index.js b/workers/predictor-bayesean-show/index.js deleted file mode 100644 index ce042f7..0000000 --- a/workers/predictor-bayesean-show/index.js +++ /dev/null @@ -1,20 +0,0 @@ -'use strict'; - -const {exec} = require('child_process'); -const path = require('path'); - -function runShowPredictor(callback) { - exec("python3 predictor.py", { - "cwd": path.resolve(__dirname) - }, - (error, stdout, stderr) => { - if(error) {throw error;} - console.log(stdout); - console.log(stderr); - callback(); - }); -} - -exports.runShowPredictor = runShowPredictor; - -runShowPredictor(() => {console.log("Prediction complete!");}); \ No newline at end of file diff --git a/workers/predictor-bayesean-show/predictor.py b/workers/predictor-bayesean-show/predictor.py deleted file mode 100644 index c3044b5..0000000 --- a/workers/predictor-bayesean-show/predictor.py +++ /dev/null @@ -1,127 +0,0 @@ -from matplotlib import pyplot as plt -import numpy as np -import pandas as pd -import seaborn as sns -import pymc3 as pm -from pymc3.distributions.timeseries import GaussianRandomWalk -from theano import tensor as T -import random -import json -import math - -infile = "../formatter-bayesean-show/training_show_characters.json" -outfile = "./show_predictor_output.json" - -#read input file -df = pd.read_json(path_or_buf = infile, typ = "frame") - -df.livedTo += 1; #this is because having died in the n-th season still means you endured the risk of the n-th season - -#get some parameters -num_characters = df.shape[0] -characters = np.arange(num_characters) #vector 1....num_characters - -#set parameters -interval_length = 1 #discretization over interval_length-year intervals 
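# Worked example of the discretization (values assumed for illustration): if
# df.livedTo.max() == 8 after the +1 shift above, then interval_bounds below
# is np.arange(0, 10) == [0, 1, ..., 9], giving n_intervals == 9 one-season
# risk intervals.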
-interval_bounds = np.arange (0, df.livedTo.max() + interval_length + 1, interval_length) #vector describing the boundaries of the intervals -n_intervals = interval_bounds.size - 1 #number of intervals, given max livedTo -intervals = np.arange(n_intervals) #indexes of intervals in a vector - -#determine death matrix and exposure matrix -last_period = np.floor((df.livedTo - 0.01) / interval_length).astype(int) #last period where a character was observed - -death = np.zeros((num_characters, n_intervals)) #matrix rows = chars, cols = intervals, cell = 1 if character died in this interval -death[characters, last_period]=df.isDead - -exposure = np.greater_equal.outer(df.livedTo, interval_bounds[:-1])*interval_length #matrix rows=chars, cols=intervals, cell = #years character was exposed to risk in this interval -exposure[characters, last_period] = df.livedTo - interval_bounds[last_period] -exposure=exposure.astype(np.float) #keep it as a float for calculation purposes - -#too many zeroes in the exposure matrix apparently cause a lot of problems, so just replace them with sth very small -filter_func = np.vectorize(lambda v: 1e-200 if v<=0 else v) #assuming a tiny chance of dying after you're dead isn't so bad, is it? -exposure = filter_func(exposure) - -#convert the DataFrame into a numPy array (also exclude columns we don't want to have as training parameters) -df_dropped = df.drop(["livedTo", "isDead", "name"], axis=1) -colNames = df_dropped.columns.values.tolist() #will use later when writing the prediction file -df_num=df_dropped.to_numpy().astype(float) #characters=rows, attributes=cols -num_parameters = df_num.shape[1]; - -SEED = random.randint(1,10000000) #will be used in the sampler -#create the model -with pm.Model() as model: - lambda0 = pm.Gamma('lambda0', mu=0.15, sd=0.1, shape=1) #this is a scalar (base chance to die per episode) - beta = pm.Normal('beta', mu=0, sd=1000, shape=num_parameters) #this is a vector (one coefficient per covariate) - lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(T.dot(df_num, beta)), lambda0)) #this is a matrix (risk of character(row) in a time slice(col)) - mu = pm.Deterministic('mu', exposure*lambda_) #this is also a matrix (risk = 0 if character already dead, otherwise same as lambda_) - obs = pm.Poisson('obs', mu, observed=death) - -n_samples = 100 #both should be 1000, 100 for quick testing -n_tune = 100 -acceptance_probability = 0.9 -num_chains = 2 -#now, sample the model -with model: - trace = pm.sample(n_samples, tune = n_tune, random_seed=SEED, chains = num_chains, nuts_kwargs = dict(target_accept=acceptance_probability)) - -#print(trace['beta'].mean(axis = 0)) -#print(trace['lambda0']) - -beta = trace['beta'] #rows = samples, columns = coefficients -lambda0 = trace['lambda0'] #rows = samples, single column = base risk per episode - -num_slices = 50 #since lambda0 is the same for all slices, this indicates how far into the future the model must look - -def get_dotprodfactors(params): #get the hazard multipliers (not yet exponentiated) for each sample of the trace, depending on the parameters - return trace['beta'].dot(np.atleast_2d(params).transpose()) #mutliple dot products => matrix multiplication - -def calc_hazard(dotprodfactors): #calculates hazard values for each time slice up to, but not including, time, dependent on the params - return (trace['lambda0'] * np.ones(num_slices)) * np.exp(dotprodfactors) - -def cum_hazard(hazard): #given hazard-per-timeslice values, calculate cumulative hazard - return 
(interval_length*hazard).cumsum(axis=-1) - -def survival(hazard): #describes likelihood of surviving the cumulative hazard - return np.exp(-cum_hazard(hazard)) - -def survivalParams(params): #describes survival function distribution (i.e. a set of samples, each being a survival function), given some params - return survival(calc_hazard(get_dotprodfactors(params))) - -def fitAge_greater_equal(survFn, greaterThan): #how many years are equally or more probable than greaterThan? - fits = np.greater_equal(survFn, greaterThan).astype(int).sum(axis=1)*interval_length - return fits - -def fitAge_greater_equal_mean(survFn, greaterThan): - return fitAge_greater_equal(survFn, greaterThan).mean() - -def fitAge_greater_equal_last(survFn, greaterThan): - return fitAge_greater_equal(survFn, greaterThan)[-1] - -#Now construct the output file -predictions = {} #we'll write this dict to a JSON -#predictions["priorHazard"] = trace['lambda0'].astype(float).tolist() -predictions["attributes"] = colNames -beta = trace['beta'] # make a mean of all rows in the entire trace, transform the column matrix into a (single-) row matrix and get the row out -#predictions["betaExp"] = np.exp(beta).astype(float).tolist() -predictions["meanBetaExp"] = np.exp(beta.mean(axis=0)).astype(float).tolist() -predictions["characters"] = [] -#now add the survial function for every character -for i in range(0, num_characters): - ch = {} #this dict will represent the character's survival function - ch["name"] = df["name"][i] - ch["alive"] = False if df["isDead"][i] > 0 else True - ch["livedTo"] = df["livedTo"].astype(float)[i] - survFn= survivalParams(df_num[i, :]).astype(float) #take the i-th row of df_num for the character's parameters - fitAge50 = fitAge_greater_equal(survFn, 0.5).astype(float) - #ch["predictedSurvivalAge"] = fitAge50.tolist() - #ch["likelihoodSeason8"] = (np.sum(np.greater_equal(fitAge50, 8).astype(float)))/(n_samples*num_chains) - confidence = 0.8 - #ch["confIntervalLower"] = fitAge_greater_equal(survFn, confidence).astype(float).tolist() - #ch["confIntervalHigher"] = fitAge_greater_equal(survFn, 1-confidence).astype(float).tolist() - #ch["confIntervalConfidence"] = confidence - ch["survivalFunctionMean"] = survFn.mean(axis=0).tolist() - predictions["characters"].append(ch) - -#now write the predictions object to a file -output = open(outfile, 'w') -json.dump(predictions, output, indent=2) diff --git a/workers/predictor-bayesean-book/book_predictor_output.json b/workers/predictors-bayesian/predictor-bayesean-book/book_predictor_output.json similarity index 99% rename from workers/predictor-bayesean-book/book_predictor_output.json rename to workers/predictors-bayesian/predictor-bayesean-book/book_predictor_output.json index ebe1cdc..5416730 100644 --- a/workers/predictor-bayesean-book/book_predictor_output.json +++ b/workers/predictors-bayesian/predictor-bayesean-book/book_predictor_output.json @@ -52896,4 +52896,4 @@ ] } ] -} \ No newline at end of file +} diff --git a/workers/predictors-bayesian/predictor-bayesean-book/index.js b/workers/predictors-bayesian/predictor-bayesean-book/index.js new file mode 100644 index 0000000..65970bb --- /dev/null +++ b/workers/predictors-bayesian/predictor-bayesean-book/index.js @@ -0,0 +1,27 @@ +'use strict'; + +const { exec } = require('child_process'); +const path = require('path'); + +function runBookPredictor(callback) { + exec( + 'python3 predictor.py', + { + cwd: path.resolve(__dirname), + }, + (error, stdout, stderr) => { + if (error) { + throw error; + } + 
console.log(stdout); + console.log(stderr); + callback(); + }, + ); +} + +exports.runBookPredictor = runBookPredictor; + +runBookPredictor(() => { + console.log('Prediction complete!'); +}); diff --git a/workers/predictors-bayesian/predictor-bayesean-book/predictor.py b/workers/predictors-bayesian/predictor-bayesean-book/predictor.py new file mode 100644 index 0000000..6f63ccd --- /dev/null +++ b/workers/predictors-bayesian/predictor-bayesean-book/predictor.py @@ -0,0 +1,120 @@ +from matplotlib import pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import pymc3 as pm +from pymc3.distributions.timeseries import GaussianRandomWalk +from theano import tensor as T +import random +import json +import math +import sys + +infile = "../../formatter-bayesean-book/training_book_characters.json" +outfile = "./book_predictor_output.json" + +# read input file +df = pd.read_json(path_or_buf = infile, typ = "frame") + +# get some parameters +num_characters = df.shape[0] +characters = np.arange(num_characters) # vector 1....num_characters + +# set parameters +interval_length = 1 # discretization over interval_length-year intervals +interval_bounds = np.arange (0, df.age.max() + interval_length + 1, interval_length) # vector describing the boundaries of the intervals +n_intervals = interval_bounds.size - 1 # number of intervals, given max age +intervals = np.arange(n_intervals) # indexes of intervals in a vector + +# determine death matrix and exposure matrix +last_period = np.floor((df.age - 0.01) / interval_length).astype(int) # last period where a character was observed + +death = np.zeros((num_characters, n_intervals)) # matrix rows = chars, cols = intervals, cell = 1 if character died in this interval +death[characters, last_period]=df.isDead + +exposure = np.greater_equal.outer(df.age, interval_bounds[:-1])*interval_length # matrix rows=chars, cols=intervals, cell = number of years character was exposed to risk in this interval +exposure[characters, last_period] = df.age - interval_bounds[last_period] +exposure=exposure.astype(np.float) # keep it as a float for calculation purposes + +# too many zeroes in the exposure matrix apparently cause a lot of problems, so just replace them with sth very small +filter_func = np.vectorize(lambda v: 1e-200 if v<=0 else v) # assuming a tiny chance of dying after you're dead isn't so bad, is it? 
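# Worked example of the two matrices above (values assumed for illustration):
# with interval_length == 1 and a dead character whose age == 2.5,
# last_period == floor(2.49 / 1) == 2, so
#   death[i]    == [0, 0, 1, 0, ...]    (the death falls in interval 2)
#   exposure[i] == [1, 1, 0.5, 0, ...]  (full exposure in intervals 0 and 1,
#                                        half of interval 2, none afterwards)
# and the zeros are then bumped to 1e-200 by the filter_func call below.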
+exposure = filter_func(exposure)
+
+# convert the DataFrame into a NumPy array (also exclude columns we don't want to have as training parameters)
+df_dropped = df.drop(["age", "isDead", "name"], axis=1)
+colNames = df_dropped.columns.values.tolist()  # will use later when writing the prediction file
+df_num = df_dropped.to_numpy().astype(float)  # characters = rows, attributes = cols
+num_parameters = df_num.shape[1]
+
+SEED = random.randint(1, 10000000)  # will be used in the sampler
+# create the model
+with pm.Model() as model:
+    lambda0 = pm.Gamma('lambda0', mu=0.02, sd=0.02, shape=n_intervals)  # this is a vector (base risk to die in a time slice)
+    beta = pm.Normal('beta', mu=0, sd=1000, shape=num_parameters)  # this is a vector (one coefficient per covariate)
+    lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(T.dot(df_num, beta)), lambda0))  # this is a matrix (risk of character (row) in a time slice (col))
+    mu = pm.Deterministic('mu', exposure * lambda_)  # this is also a matrix (risk = 0 if character already dead, otherwise same as lambda_)
+    obs = pm.Poisson('obs', mu, observed=death)
+
+n_samples = 100  # both should be 1000; set to 100 for quick testing
+n_tune = 100
+acceptance_probability = 0.9
+num_chains = 2
+# now, sample the model
+with model:
+    trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED, chains=num_chains, nuts_kwargs=dict(target_accept=acceptance_probability))
+
+# trace = samples from the trained posterior distribution
+# trace['beta'] is a matrix: rows = all the samples, columns = sampled beta vector
+# trace['lambda0'] is a matrix: rows = all the samples, cols = sampled chance to die in a given time slice
+
+def get_dotprodfactors(params):  # get the hazard multipliers (not yet exponentiated) for each sample of the trace, depending on the parameters
+    return trace['beta'].dot(np.atleast_2d(params).transpose())
+
+def calc_hazard(dotprodfactors):  # calculates the hazard value for each time slice, dependent on the params
+    return trace['lambda0'] * np.exp(dotprodfactors)
+
+def cum_hazard(hazard):  # given hazard-per-timeslice values, calculate cumulative hazard
+    return (interval_length*hazard).cumsum(axis=-1)
+
+def survival(hazard):  # describes likelihood of surviving the cumulative hazard
+    return np.exp(-cum_hazard(hazard))
+
+def survivalParams(params):  # describes the survival function distribution (i.e. a set of samples, each being a survival function), given some params
+    return survival(calc_hazard(get_dotprodfactors(params)))
+
+def fitAge_greater_equal(survFn, greaterThan):  # for how many years does the survival probability stay at or above greaterThan?
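+    # e.g. for one posterior sample row [0.9, 0.7, 0.4, 0.2] and
+    # greaterThan = 0.5, two entries are >= 0.5, so that sample's fitted
+    # age is 2 * interval_length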
+    fits = np.greater_equal(survFn, greaterThan).astype(int).sum(axis=1)*interval_length
+    return fits
+
+def fitAge_greater_equal_mean(survFn, greaterThan):
+    return fitAge_greater_equal(survFn, greaterThan).mean()
+
+def fitAge_greater_equal_last(survFn, greaterThan):
+    return fitAge_greater_equal(survFn, greaterThan)[-1]
+
+# Now construct the output file
+predictions = {}  # we'll write this dict to a JSON
+# predictions["priorHazard"] = trace['lambda0'].mean(axis=0).astype(float).tolist()
+predictions["attributes"] = colNames
+beta = trace['beta']  # rows = samples, columns = coefficients; averaged over all samples below
+# predictions["betaExp"] = np.exp(beta).astype(float).tolist()
+predictions["meanBetaExp"] = np.exp(beta.mean(axis=0)).astype(float).tolist()
+predictions["characters"] = []
+# now add the survival function for every character
+for i in range(0, num_characters):
+    ch = {}  # this dict will represent the character's survival function
+    ch["name"] = df["name"][i]
+    ch["alive"] = False if df["isDead"][i] > 0 else True
+    ch["age"] = df["age"].astype(float)[i]
+    survFn = survivalParams(df_num[i, :]).astype(float)  # take the i-th row of df_num for the character's parameters
+    # ch["predictedSurvivalAge"] = fitAge_greater_equal(survFn, 0.5).astype(float).tolist()
+    confidence = 0.8
+    # ch["confIntervalLower"] = fitAge_greater_equal(survFn, confidence).astype(float).tolist()
+    # ch["confIntervalHigher"] = fitAge_greater_equal(survFn, 1-confidence).astype(float).tolist()
+    # ch["confIntervalConfidence"] = confidence
+    ch["survivalFunctionMean"] = survFn.mean(axis=0).tolist()
+    predictions["characters"].append(ch)
+
+# now write the predictions object to a file
+with open(outfile, 'w') as output:
+    json.dump(predictions, output, indent=2)
diff --git a/workers/predictors-bayesian/predictor-bayesean-show/index.js b/workers/predictors-bayesian/predictor-bayesean-show/index.js
new file mode 100644
index 0000000..1929e96
--- /dev/null
+++ b/workers/predictors-bayesian/predictor-bayesean-show/index.js
@@ -0,0 +1,27 @@
+'use strict';
+
+const { exec } = require('child_process');
+const path = require('path');
+
+function runShowPredictor(callback) {
+  exec(
+    'python3 predictor.py',
+    {
+      cwd: path.resolve(__dirname),
+    },
+    (error, stdout, stderr) => {
+      if (error) {
+        throw error;
+      }
+      console.log(stdout);
+      console.log(stderr);
+      callback();
+    },
+  );
+}
+
+exports.runShowPredictor = runShowPredictor;
+
+runShowPredictor(() => {
+  console.log('Prediction complete!');
+});
diff --git a/workers/predictors-bayesian/predictor-bayesean-show/predictor.py b/workers/predictors-bayesian/predictor-bayesean-show/predictor.py
new file mode 100644
index 0000000..3af5abf
--- /dev/null
+++ b/workers/predictors-bayesian/predictor-bayesean-show/predictor.py
@@ -0,0 +1,133 @@
+from matplotlib import pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import pymc3 as pm
+from pymc3.distributions.timeseries import GaussianRandomWalk
+from theano import tensor as T
+import random
+import json
+import math
+
+infile = "../../formatter-bayesean-show/training_show_characters.json"
+outfile = "./show_predictor_output.json"
+
+# read input file
+df = pd.read_json(path_or_buf=infile, typ="frame")
+
+df.livedTo += 1  # this is because having died in the n-th season still means you endured the risk of the n-th season
+
+# get some parameters
+num_characters = df.shape[0]
+characters = np.arange(num_characters)  # vector 0...num_characters-1
+
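+# Note: unlike the book model, which discretizes a character's age in years,
+# the time axis here is seasons survived, and the base hazard further below
+# is a single shared per-season value instead of one value per interval.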
+# set parameters
+interval_length = 1  # discretization over interval_length-season intervals
+interval_bounds = np.arange(0, df.livedTo.max() + interval_length + 1, interval_length)  # vector describing the boundaries of the intervals
+n_intervals = interval_bounds.size - 1  # number of intervals, given max livedTo
+intervals = np.arange(n_intervals)  # indexes of intervals in a vector
+
+# determine death matrix and exposure matrix
+last_period = np.floor((df.livedTo - 0.01) / interval_length).astype(int)  # last period where a character was observed
+
+death = np.zeros((num_characters, n_intervals))  # matrix: rows = chars, cols = intervals, cell = 1 if character died in this interval
+death[characters, last_period] = df.isDead
+
+exposure = np.greater_equal.outer(df.livedTo, interval_bounds[:-1]) * interval_length  # matrix: rows = chars, cols = intervals, cell = number of seasons character was exposed to risk in this interval
+exposure[characters, last_period] = df.livedTo - interval_bounds[last_period]
+exposure = exposure.astype(float)  # keep it as a float for calculation purposes
+
+# too many zeroes in the exposure matrix apparently cause a lot of problems, so just replace them with something very small
+filter_func = np.vectorize(lambda v: 1e-200 if v <= 0 else v)  # assuming a tiny chance of dying after you're dead isn't so bad, is it?
+exposure = filter_func(exposure)
+
+# convert the DataFrame into a NumPy array (also exclude columns we don't want to have as training parameters)
+df_dropped = df.drop(["livedTo", "isDead", "name"], axis=1)
+colNames = df_dropped.columns.values.tolist()  # will use later when writing the prediction file
+df_num = df_dropped.to_numpy().astype(float)  # characters = rows, attributes = cols
+num_parameters = df_num.shape[1]
+
+SEED = random.randint(1, 10000000)  # will be used in the sampler
+# create the model
+with pm.Model() as model:
+    lambda0 = pm.Gamma('lambda0', mu=0.15, sd=0.1, shape=1)  # this is a scalar (base chance to die per season)
+    beta = pm.Normal('beta', mu=0, sd=1000, shape=num_parameters)  # this is a vector (one coefficient per covariate)
+    lambda_ = pm.Deterministic('lambda_', T.outer(T.exp(T.dot(df_num, beta)), lambda0))  # this is a matrix (risk of character (row) in a time slice (col))
+    mu = pm.Deterministic('mu', exposure * lambda_)  # this is also a matrix (risk = 0 if character already dead, otherwise same as lambda_)
+    obs = pm.Poisson('obs', mu, observed=death)
+
+n_samples = 100  # both should be 1000; set to 100 for quick testing
+n_tune = 100
+acceptance_probability = 0.9
+num_chains = 2
+# now, sample the model
+with model:
+    trace = pm.sample(n_samples, tune=n_tune, random_seed=SEED, chains=num_chains, nuts_kwargs=dict(target_accept=acceptance_probability))
+
+# print(trace['beta'].mean(axis=0))
+# print(trace['lambda0'])
+
+beta = trace['beta']  # rows = samples, columns = coefficients
+lambda0 = trace['lambda0']  # rows = samples, single column = base risk per season
+
+num_slices = 50  # since lambda0 is the same for all slices, this indicates how far into the future the model must look
+
+def get_dotprodfactors(params):  # get the hazard multipliers (not yet exponentiated) for each sample of the trace, depending on the parameters
+    return trace['beta'].dot(np.atleast_2d(params).transpose())  # multiple dot products => matrix multiplication
+
+def calc_hazard(dotprodfactors):  # calculates the hazard value for each time slice, dependent on the params
+    return (trace['lambda0'] * np.ones(num_slices)) * np.exp(dotprodfactors)
+
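+# The helpers below implement S(t) = exp(-H(t)) with cumulative hazard H;
+# e.g. a constant per-season hazard of 0.15 gives a one-season survival
+# probability of exp(-0.15), roughly 0.86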
+def cum_hazard(hazard):  # given hazard-per-timeslice values, calculate cumulative hazard
+    return (interval_length*hazard).cumsum(axis=-1)
+
+def survival(hazard):  # describes likelihood of surviving the cumulative hazard
+    return np.exp(-cum_hazard(hazard))
+
+def survivalParams(params):  # describes the survival function distribution (i.e. a set of samples, each being a survival function), given some params
+    return survival(calc_hazard(get_dotprodfactors(params)))
+
+def fitAge_greater_equal(survFn, greaterThan):  # for how many seasons does the survival probability stay at or above greaterThan?
+    fits = np.greater_equal(survFn, greaterThan).astype(int).sum(axis=1)*interval_length
+    return fits
+
+def fitAge_greater_equal_mean(survFn, greaterThan):
+    return fitAge_greater_equal(survFn, greaterThan).mean()
+
+def fitAge_greater_equal_last(survFn, greaterThan):
+    return fitAge_greater_equal(survFn, greaterThan)[-1]
+
+# Now construct the output file
+predictions = {}  # we'll write this dict to a JSON
+# predictions["priorHazard"] = trace['lambda0'].astype(float).tolist()
+predictions["attributes"] = colNames
+beta = trace['beta']  # rows = samples, columns = coefficients; averaged over all samples below
+# predictions["betaExp"] = np.exp(beta).astype(float).tolist()
+predictions["meanBetaExp"] = np.exp(beta.mean(axis=0)).astype(float).tolist()
+predictions["characters"] = []
+# now add the survival function for every character
+for i in range(0, num_characters):
+    ch = {}  # this dict will represent the character's survival function
+    ch["name"] = df["name"][i]
+    ch["alive"] = False if df["isDead"][i] > 0 else True
+    ch["livedTo"] = df["livedTo"].astype(float)[i]
+    survFn = survivalParams(df_num[i, :]).astype(float)  # take the i-th row of df_num for the character's parameters
+    fitAge50 = fitAge_greater_equal(survFn, 0.5).astype(float)
+    # ch["predictedSurvivalAge"] = fitAge50.tolist()
+    # ch["likelihoodSeason8"] = (np.sum(np.greater_equal(fitAge50, 8).astype(float)))/(n_samples*num_chains)
+    confidence = 0.8
+    # ch["confIntervalLower"] = fitAge_greater_equal(survFn, confidence).astype(float).tolist()
+    # ch["confIntervalHigher"] = fitAge_greater_equal(survFn, 1-confidence).astype(float).tolist()
+    # ch["confIntervalConfidence"] = confidence
+    ch["survivalFunctionMean"] = survFn.mean(axis=0).tolist()
+    predictions["characters"].append(ch)
+
+# now write the predictions object to a file
+with open(outfile, 'w') as output:
+    json.dump(predictions, output, indent=2)
diff --git a/workers/predictor-bayesean-show/show_predictor_output.json b/workers/predictors-bayesian/predictor-bayesean-show/show_predictor_output.json
similarity index 99%
rename from workers/predictor-bayesean-show/show_predictor_output.json
rename to workers/predictors-bayesian/predictor-bayesean-show/show_predictor_output.json
index 1efab47..51f417b 100644
--- a/workers/predictor-bayesean-show/show_predictor_output.json
+++ b/workers/predictors-bayesian/predictor-bayesean-show/show_predictor_output.json
@@ -12011,4 +12011,4 @@
       ]
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/workers/predictors-bayesian/requirements.txt b/workers/predictors-bayesian/requirements.txt
new file mode 100644
index 0000000..b591319
--- /dev/null
+++ b/workers/predictors-bayesian/requirements.txt
@@ -0,0 +1,8 @@
+matplotlib
+numpy
+pandas
+seaborn
+pymc3
+theano
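+# versions are unpinned; the scripts target the PyMC3 3.x / Theano API
+# (e.g. nuts_kwargs in pm.sample), so pin versions here if builds must be reproducible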
diff --git a/workers/uploader-attributes-bayesean/index.js b/workers/uploader-attributes-bayesean/index.js
index 1b128b7..17ce3c6 100644
--- a/workers/uploader-attributes-bayesean/index.js
+++ b/workers/uploader-attributes-bayesean/index.js
@@ -1,35 +1,31 @@
 const utils = require('../common/utils');
-const config = require('../common/config');
 
 async function updatePredictions(callback) {
-  let [bookPred, showPred] = await Promise.all([
-    utils.loadBayeseanPredictionsBook(),
-    utils.loadBayeseanPredictionsShow(),
-  ]);
+  let [bookPred, showPred] = await Promise.all([utils.loadBayeseanPredictionsBook(), utils.loadBayeseanPredictionsShow()]);
   const updater = await new utils.APIUpdater().init();
-
-  //book coefficients
+
+  // book coefficients
   let bookAttrs = bookPred.attributes;
   let bookMeanBetaExp = bookPred.meanBetaExp;
   let bookCoefficients = {};
-  for (let i=0; i
 config.GOT_CURRENT_YEAR_BOOK + 1 - birth) {
-      plod = (1 - ch.survivalFunctionMean[config.GOT_CURRENT_YEAR_BOOK + 1 - birth]);
+      plod = 1 - ch.survivalFunctionMean[config.GOT_CURRENT_YEAR_BOOK + 1 - birth];
     }
-
+
     let survFnStart = config.GOT_CURRENT_YEAR_BOOK;
     let survFn = ch.survivalFunctionMean.slice(config.GOT_CURRENT_YEAR_BOOK - birth, config.GOT_CURRENT_YEAR_BOOK - birth + 21);
-
-    //update predictions online
+
+    // update predictions online
     console.log(await updater.updatePLODLongevity('book', name, survFn, survFnStart, plod));
   }
-
-  //show coefficients
+
+  // show coefficients
   let showAttrs = showPred.attributes;
   let showMeanBetaExp = showPred.meanBetaExp;
   let showCoefficients = {};
-  for (let i=0; i