No Description
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2017-07-27-jobify-helpers-colors.R 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  #' @title Read a mongoexport JSON file
  #' @description Reads a JSON-like file generated via mongoexport into a data
  #'   frame by handing a file connection to `jsonlite::stream_in()`.
  #' @param path File path using `/` as separator, including the file name with
  #'   its `.json` extension.
  #' @return The data frame returned by `jsonlite::stream_in()`.
  #' @export readMongo
  readMongo <- function(path = "C:/Users/Frederik Gremler/Documents/recent-mongo/young_professional.json") {
    # Call through the namespace instead of require(): a missing jsonlite now
    # fails immediately with a clear error, and the caller's search path is
    # not modified as a side effect.
    jsonlite::stream_in(file(path))
  }
  #' @title Extract the last value of every entry in a list of lists
  #' @description Walks over a list whose entries are themselves lists or
  #'   vectors and collects the last element of every entry into one vector.
  #' @param x A list of lists / vectors.
  #' @param type Storage mode of the result, e.g. "numeric" or "character".
  #'   Default is "numeric".
  #' @return A vector with one element per entry of `x`. Entries of length zero
  #'   (including `NULL`) yield `NA`. An empty `x` yields an empty vector of
  #'   mode `type`.
  #' @export readListofLists
  readListofLists <- function(x, type = "numeric") {
    # Honour `type`; the previous version always allocated a numeric vector.
    result <- vector(mode = type, length = length(x))
    # seq_along() instead of 1:length(x) so an empty list is a no-op.
    for (i in seq_along(x)) {
      entry <- x[[i]]
      n <- length(entry)
      if (n == 0L) {
        # A zero-length replacement would abort the assignment.
        result[i] <- NA
      } else {
        result[[i]] <- entry[n]
      }
    }
    result
  }
  #' @title Apply a function to one column of every data frame in a list
  #' @description Applies `.f` to the selected column of each data frame in a
  #'   list (e.g. star ratings in the competence tree) and collects the results.
  #' @param originallist A list of data frames.
  #' @param column The column of each data frame to pass to `.f`, given as a
  #'   string.
  #' @param .f The function to apply (e.g. `mean`).
  #' @param ... Further arguments forwarded to `.f` (e.g. `na.rm = TRUE`).
  #' @param type String that sets the storage mode of the result. Default:
  #'   "numeric".
  #' @return A vector of mode `type` with one element per entry of
  #'   `originallist`; empty if `originallist` is empty.
  #' @export functionToListOfLists
  functionToListOfLists <- function(originallist, column, .f, ..., type = "numeric") {
    # Preallocate the result so the loop only fills slots.
    results <- vector(mode = type, length = length(originallist))
    # seq_along() keeps an empty input from iterating over c(1, 0).
    for (i in seq_along(originallist)) {
      results[[i]] <- .f(originallist[[i]][, column], ...)
    }
    results
  }
  ## HELPERS FOR WORKING WITH THE DATA ONCE IT HAS BEEN READ IN
  #' @title Remove NULLs from lists
  #' @description Drops `NULL` entries from a list, recursing into nested
  #'   lists. The entries are removed, not replaced by `NA`; use `nullToNA` for
  #'   that.
  #' @param x A list that may contain `NULL` entries.
  #' @return `x` without its `NULL` entries. Nested plain lists are cleaned as
  #'   well; data frames and atomic vectors are passed through unchanged.
  #' @export rmNull
  rmNull <- function(x) {
    kept <- Filter(Negate(is.null), x)
    # lapply() would turn atomic vectors and data frames into plain lists,
    # so only recurse through genuine lists.
    if (!is.list(kept) || is.data.frame(kept)) {
      return(kept)
    }
    # The previous version computed this lapply() but threw the result away,
    # so NULLs below the top level were never removed.
    lapply(kept, function(entry) {
      if (is.list(entry) && !is.data.frame(entry)) rmNull(entry) else entry
    })
  }
  #' @title Convert NULLs to NA
  #' @description Replaces every `NULL` entry of a list with `NA`.
  #' @param x A list (or a vector) that may contain `NULL` entries.
  #' @return `x` with each `NULL` entry replaced by `NA`; the length is
  #'   unchanged.
  #' @export nullToNA
  nullToNA <- function(x) {
    # vapply() always yields a logical vector. sapply() returns list() for
    # empty input, which is not a valid subscript.
    is_missing <- vapply(x, is.null, logical(1))
    x[is_missing] <- NA
    x
  }
  #' @title Exclude tags
  #' @description Removes the observations carrying the given tag(s) from the
  #'   data set. Several tags can be passed at once as a character vector.
  #' @param dataframe A data frame with the column "match_groups".
  #' @param tag Tag(s) to remove (default: "test_neu").
  #' @return `dataframe` without the rows whose `match_groups` equals one of
  #'   `tag`. Rows with a missing `match_groups` value are kept.
  #' @export excludeTag
  excludeTag <- function(dataframe, tag = "test_neu") {
    if (!is.character(tag)) {
      stop("Tag hast to be set in quotes ('') and has to be a string. Try again!")
    }
    if (is.null(dataframe$match_groups)) {
      stop("The given dataframe does not have a column 'match_groups'.")
    }
    # %in% never returns NA, so rows with a missing tag are kept as they are
    # instead of being turned into all-NA rows, and several tags are matched
    # as a set rather than recycled element-wise.
    keep <- !(dataframe$match_groups %in% tag)
    dataframe[keep, , drop = FALSE]
  }
  #' @title Calculate age from birthdate
  #' @description Computes the age in whole years for a vector of start dates
  #'   (e.g. from Starhunter) relative to a reference date.
  #' @param birthDate Date vector in the format YYYY-MM-DD.
  #' @param refDate The date at which the age is evaluated. Defaults to
  #'   `Sys.Date()`, i.e. today.
  #' @return The `year` component of the lubridate period between `birthDate`
  #'   and `refDate`.
  #' @export calcAge
  calcAge <- function(birthDate, refDate = Sys.Date()) {
    # Namespaced calls replace require(lubridate): a missing package errors
    # right here, and nothing is attached to the caller's search path.
    span <- lubridate::interval(birthDate, refDate)
    age <- lubridate::as.period(span, unit = "year")
    age$year
  }
  #' @title Get gender of YPs
  #' @description Takes the salutation column ("title") of the YP data frame
  #'   and converts it into male / female labels of your choice.
  #' @param ypdataframe A data frame with the column "title".
  #' @param men The label returned for men (title "Herr"), default: "m".
  #' @param women The label returned for women (title "Frau"), default: "w".
  #' @return A vector with one entry per row: `men` for "Herr", `women` for
  #'   "Frau" and `NA` for any other title. If the column "title" is missing, a
  #'   warning is raised instead.
  #' @export getGender
  getGender <- function(ypdataframe, men = "m", women = "w") {
    if (!("title" %in% colnames(ypdataframe))) {
      warning("Your dataframe does not have the column 'title' which is needed to do this")
    } else {
      salutation <- ypdataframe$title
      gender <- rep(NA, length(salutation))
      # "Frau" is the female salutation and "Herr" the male one; the labels
      # were assigned the wrong way round before. which() skips NA titles.
      gender[which(salutation == "Frau")] <- women
      gender[which(salutation == "Herr")] <- men
      gender
    }
  }
  #' @title Get single answers of personality scale
  #' @description Isolates the answers to the Big5 personality questions (the
  #'   "q3" items inside `answers`) and converts the answer strings into
  #'   numeric values.
  #' @param dataframe The YP data frame with "answers" containing the
  #'   personality questions.
  #' @return A data frame with one numeric column per personality item, named
  #'   after the item.
  #' @export getPersonalityAnswers
  getPersonalityAnswers <- function(dataframe) {
    # inherits() also accepts tibbles; comparing class() with != yields a
    # condition of length > 1 for them.
    if (!inherits(dataframe, "data.frame")) {
      stop("Be sure to give the right (YP) data.frame to the function")
    }
    # Pick the personality items; each selected column is itself a table with
    # a "values" column.
    persUniItems <- dplyr::select(dataframe$answers, dplyr::contains("q3"))
    # Kept so the result can carry meaningful column names.
    variablenames <- names(persUniItems)
    if (length(persUniItems) == 0L) {
      stop("The 'answers' column does not contain any 'q3' personality items")
    }
    # Frame with the right number of rows; its placeholder column is
    # overwritten by the first item.
    rawValues <- data.frame(seq_len(NROW(dataframe)))
    for (i in seq_along(persUniItems)) {
      rawValues[[i]] <- persUniItems[[i]][, "values"]
    }
    # Work on character strings so they can be split.
    rawValues <- purrr::map_df(rawValues, as.character)
    # Split every answer string at "-"; each item becomes a list of pieces.
    splitValues <- purrr::map(rawValues, data.table::tstrsplit, "-", fixed = TRUE)
    # The numeric answer is the second piece.
    secondParts <- unname(lapply(splitValues, function(parts) parts[[2]]))
    # stringsAsFactors = FALSE keeps the strings as strings on R < 4.0, where
    # as.numeric() would otherwise return factor codes.
    answers <- as.data.frame(secondParts, row.names = NULL, stringsAsFactors = FALSE)
    answers <- purrr::map_df(answers, as.numeric)
    colnames(answers) <- variablenames
    answers
  }
  #' @title Get answers to questions on potential (Q5)
  #' @description Isolates the answers to the potential questions (the "q5"
  #'   items inside `answers`, without `yp_q5_potentialweight`) and converts
  #'   the answer strings into numeric values.
  #' @param dataframe The YP data frame with "answers" containing the
  #'   potential questions.
  #' @return A data frame with one numeric column per potential item, named
  #'   after the item.
  #' @export getPotentialAnswers
  getPotentialAnswers <- function(dataframe) {
    # inherits() also accepts tibbles; comparing class() with != yields a
    # condition of length > 1 for them.
    if (!inherits(dataframe, "data.frame")) {
      stop("Be sure to give the right (YP) data.frame to the function")
    }
    # Pick the potential items and drop the weight column, which is not an
    # answer item.
    potUniItems <- dplyr::select(dataframe$answers, dplyr::contains("q5"))
    potUniItems <- dplyr::select(potUniItems, -yp_q5_potentialweight)
    # Kept so the result can carry meaningful column names.
    variablenames <- names(potUniItems)
    if (length(potUniItems) == 0L) {
      stop("The 'answers' column does not contain any 'q5' potential items")
    }
    # Frame with the right number of rows; its placeholder column is
    # overwritten by the first item.
    rawValues <- data.frame(seq_len(NROW(dataframe)))
    for (i in seq_along(potUniItems)) {
      rawValues[[i]] <- potUniItems[[i]][, "values"]
    }
    # Work on character strings so they can be split.
    rawValues <- purrr::map_df(rawValues, as.character)
    # Split every answer string at "-"; each item becomes a list of pieces.
    splitValues <- purrr::map(rawValues, data.table::tstrsplit, "-", fixed = TRUE)
    # The numeric answer is the second piece.
    secondParts <- unname(lapply(splitValues, function(parts) parts[[2]]))
    # stringsAsFactors = FALSE keeps the strings as strings on R < 4.0, where
    # as.numeric() would otherwise return factor codes.
    answers <- as.data.frame(secondParts, row.names = NULL, stringsAsFactors = FALSE)
    answers <- purrr::map_df(answers, as.numeric)
    colnames(answers) <- variablenames
    answers
  }
  #' @title Get latest grade averages from education
  #' @description Takes the education column of a YP data frame, extracts the
  #'   `final_grade` values of every entry and keeps the last one. The result
  #'   can be merged with `cbind`.
  #' @param ypdataframe YP data frame with the variable "education".
  #' @return Numeric vector with one final grade per entry of `education`.
  #'   Entries without any grade, and grades equal to 0, are returned as `NA`.
  #' @export getGradeAverage
  getGradeAverage <- function(ypdataframe) {
    # vapply() returns exactly one value per entry. Filling a list with
    # `[[i]] <- NULL` silently dropped trailing entries without education
    # data, so the result could be shorter than the data frame.
    latest <- vapply(
      ypdataframe$education,
      function(entry) {
        if (!is.list(entry)) {
          return(NA_real_)
        }
        grades <- entry$final_grade
        if (length(grades) == 0L) {
          return(NA_real_)
        }
        value <- as.numeric(grades[[length(grades)]])
        if (length(value) == 1L) value else NA_real_
      },
      numeric(1),
      USE.NAMES = FALSE
    )
    # A grade of 0 is recoded as missing; which() skips the NA entries.
    latest[which(latest == 0)] <- NA
    latest
  }
  #' @title Is a YP currently studying or not?
  #' @description Takes the list of education data frames stored in the mongo
  #'   YP data frame and checks whether one of them is the current degree
  #'   (column `current_study`). If so, the status is "studying"; if the YP has
  #'   a non-empty education table with no current degree, the status is
  #'   "graduated".
  #' @details Right now, this does not consider drop-outs. To-Do!
  #' @param ypdataframe The young_professional data frame from mongo with the
  #'   column "education".
  #' @return A character vector with one entry per row: "studying",
  #'   "graduated", or `NA` when the education data is missing, empty or
  #'   undecidable.
  #' @export getStudyStatus
  getStudyStatus <- function(ypdataframe = yp) {
    if (!("education" %in% colnames(ypdataframe))) {
      stop("The given dataframe does not have a column 'education' with yp-education-data. Give me one and try again!")
    }
    vapply(
      ypdataframe$education,
      function(entry) {
        # Missing education data (NULL or NA) cannot be classified; `$` on an
        # atomic NA would raise an error.
        if (!is.list(entry)) {
          return(NA_character_)
        }
        current <- entry$current_study
        # all() over nothing is TRUE, so empty tables need an explicit guard
        # to avoid being labelled "graduated".
        if (length(current) == 0L) {
          return(NA_character_)
        }
        if (any(current == TRUE, na.rm = TRUE)) {
          return("studying")
        }
        if (isTRUE(all(current == FALSE))) {
          return("graduated")
        }
        NA_character_
      },
      character(1),
      USE.NAMES = FALSE
    )
  }
  #' @title What is the current study degree of a YP?
  #' @description Takes the list of education data frames stored in the mongo
  #'   YP data frame and checks whether one of them is the current degree
  #'   (column `current_study`). If so, that degree (`final_degree`) is
  #'   returned; if the YP has a non-empty education table without a current
  #'   degree, the value is "No current degree".
  #' @details Right now, this does not consider drop-outs. Moreover, it only
  #'   counts a degree if the YP is studying RIGHT NOW. If several degrees are
  #'   flagged as current, the first one is taken and a message is emitted.
  #' @param ypdataframe The young_professional data frame from mongo with the
  #'   column "education".
  #' @return A character vector with one entry per row: the CURRENT degree,
  #'   "No current degree", or `NA` if the education data is missing or empty
  #'   (i.e. probably unfilled).
  #' @export getStudyDegree
  getStudyDegree <- function(ypdataframe = yp) {
    education <- ypdataframe$education
    degrees <- rep(NA_character_, length(education))
    for (i in seq_along(education)) {
      entry <- education[[i]]
      if (!is.list(entry)) {
        next
      }
      current <- entry$current_study
      # Empty education table: leave NA, as documented.
      if (length(current) == 0L) {
        next
      }
      active <- which(current == TRUE)
      if (length(active) > 0L) {
        # Only report rows that really carry more than one current degree.
        if (length(active) > 1L) {
          message("YP in row ", i, " has given more than one current degree. The first degree was taken.")
        }
        degree <- entry$final_degree[active[1L]]
        if (length(degree) == 1L) {
          degrees[i] <- as.character(degree)
        }
      } else if (isTRUE(all(current == FALSE))) {
        degrees[i] <- "No current degree"
      }
    }
    # Return after the loop; returning inside it stopped after the first row.
    degrees
  }
  #' @title Update / re-install the jobify package
  #' @description Installs the jobify package from a local source archive so
  #'   the install command does not have to be copied every time.
  #' @param yourpath The path leading to the .gz file of the jobify package.
  #'   On Windows, use either `\\` or `/` as separator.
  #' @export updateJobifyPackage
  updateJobifyPackage <- function(yourpath = "C:/Users/Frederik Gremler/Dropbox/[YP] Matching + Algorithmus/Daten/jobify package/jobify_0.1.0.tar.gz") {
    # repos = NULL together with type = "source" installs straight from the
    # local archive instead of querying a repository.
    utils::install.packages(yourpath, repos = NULL, type = "source")
  }
  #' @title Load all the packages
  #' @description Attaches the packages Frederik needs most often, so the
  #'   `library()` calls do not have to be copied every time. Right now it
  #'   loads: purrr, dplyr, ggplot2, plotly, jsonlite.
  #' @export loadAllThePackages
  loadAllThePackages <- function() {
    wanted <- c("purrr", "dplyr", "ggplot2", "plotly", "jsonlite")
    attached <- NULL
    # Attach in the listed order; keep the value of the last library() call
    # so the (invisible) return value stays the same.
    for (pkg in wanted) {
      attached <- library(pkg, character.only = TRUE)
    }
    invisible(attached)
  }