# Lab 1: simulate course scores for a class of students, summarise them per
# subject and per student, grade them, and draw the required plots.
#
# install.packages("ggplot2")  # one-time setup; only the last two plots use it

# ---- Helpers ----

# Round raw scores to whole points and clamp them into [lower, upper].
clamp_score <- function(x, lower = 0, upper = 100) {
  pmin(pmax(round(x), lower), upper)
}

# Letter grades based on the 80% / 60% / 40% sample quantiles of `x`:
# "A" for values at or above the 80% quantile, "B" for [60%, 80%),
# "C" for [40%, 60%) and "D" for everything below the 40% quantile.
grade_by_quantile <- function(x) {
  cutoffs <- quantile(x, c(0.8, 0.6, 0.4), names = FALSE)
  grade <- rep("D", length(x))
  # Assign from the lowest band upwards so higher bands overwrite lower ones.
  grade[x >= cutoffs[3]] <- "C"
  grade[x >= cutoffs[2]] <- "B"
  grade[x >= cutoffs[1]] <- "A"
  grade
}

# ---- 1. Student ids: 300 students, first id is 210222001 ----

n_students <- 300L
xuehao <- 210222001L + seq_len(n_students) - 1L
print(head(xuehao))

# ---- 2. Generate the scores of each subject ----

# Seed ONCE. Re-seeding with the same value before every draw makes each
# rnorm() call replay the same deviates, so the linear-algebra noise and the
# programming scores would be deterministic functions of the calculus scores
# instead of independent samples.
mySeed <- 20
set.seed(mySeed)

# Calculus: normal, mean 70, variance 10.
gaoshu <- clamp_score(rnorm(n_students, 70, sqrt(10)))

# Linear algebra: calculus score plus a rounded N(0, variance 2) disturbance.
xiandai <- clamp_score(gaoshu + round(rnorm(n_students, 0, sqrt(2))))

# English: uniform between 56 and 99 (already inside the valid range).
yingyu <- round(runif(n_students, 56, 99))

# Programming: normal, mean 85, variance 12.
chengshe <- clamp_score(rnorm(n_students, 85, sqrt(12)))

# ---- 3. Combine into a data frame and write it to a text file ----

classscore <- data.frame(xuehao, gaoshu, xiandai, yingyu, chengshe)
write.table(classscore, file = 'classscore.txt')
print(head(classscore))

# ---- 4./5. Per-subject and per-student statistics ----

score_cols <- c("gaoshu", "xiandai", "yingyu", "chengshe")
score_matrix <- as.matrix(classscore[, score_cols])

# Mean score of every subject.
subjectMean <- colMeans(score_matrix)
print(subjectMean)

# Mean, highest, lowest and total score of every student.
studentMean <- rowMeans(score_matrix)
studentMax <- apply(score_matrix, 1, max)
studentMin <- apply(score_matrix, 1, min)
studentSum <- rowSums(score_matrix)
print(head(studentMean))
print(head(studentMax))
print(head(studentMin))
print(head(studentSum))

# Grade every student by total score.
classscore$grade <- grade_by_quantile(studentSum)
print(head(classscore))

# ---- 6. Student id(s) with the highest total score (ties are all listed) ----

print(classscore$xuehao[studentSum == max(studentSum)])

# ---- 7. Plots ----

# Histogram of the calculus scores.
hist(classscore$gaoshu, breaks = 9, main = '高等数学成绩直方图',
     xlab = '数学成绩')

# Pie chart and bar chart of the calculus grade distribution. Plotting the
# 300 raw scores directly is unreadable, so the scores are graded first.
gaoshuGrade <- grade_by_quantile(classscore$gaoshu)
gaoshuTable <- table(gaoshuGrade)
print(gaoshuTable)
pie(gaoshuTable)
barplot(gaoshuTable)

# Scatter plot: calculus vs. linear algebra.
plot(classscore$gaoshu, classscore$xiandai, pch = 20,
     xlab = '高数', ylab = '线代', main = '高数和线代')

# Scatter plot: calculus vs. English.
plot(classscore$gaoshu, classscore$yingyu,
     xlab = '高数', ylab = '英语', main = '高数和英语')

# Box plot of all four subjects (boxes are labelled with the column names).
boxplot(classscore[, score_cols],
        main = '高数,线代,英语,程设成绩的箱尾图')

# Star plot of all four subjects.
stars(classscore[, score_cols], full = TRUE, draw.segments = TRUE,
      key.loc = c(30, 1.5))

# Linear regression of linear algebra on calculus. The formula uses bare
# column names so that `data = classscore` is actually honoured.
fit <- lm(xiandai ~ gaoshu, data = classscore)
plot(classscore$gaoshu, classscore$xiandai,
     xlab = '高数', ylab = '线代', main = '高数和线代')
abline(fit)

# The same relationships drawn with ggplot2. The plots are printed explicitly
# so they are also rendered when the script is run through source().
if (requireNamespace("ggplot2", quietly = TRUE)) {
  library(ggplot2)

  # Calculus vs. linear algebra with a fitted regression line.
  print(
    ggplot(data = classscore, aes(x = gaoshu, y = xiandai)) +
      geom_point(shape = 17, color = "blue", size = 2) +
      geom_smooth(method = "lm", formula = y ~ x, color = "red",
                  linetype = 2) +
      labs(title = "gaoshu&xiandai", x = "gaoshu", y = "xiandai")
  )

  # Calculus vs. English with a fitted regression line.
  print(
    ggplot(data = classscore, aes(x = gaoshu, y = yingyu)) +
      geom_point(shape = 17, color = "blue", size = 2) +
      geom_smooth(method = "lm", formula = y ~ x, color = "red",
                  linetype = 2) +
      labs(title = "gaoshu&yingyu", x = "gaoshu", y = "yingyu")
  )
} else {
  message("ggplot2 is not installed; skipping the ggplot2 regression plots.")
}

# ---- 10. Social-practice grades (A-E), added to classscore ----

level <- c("A", "B", "C", "D", "E")
shijian <- sample(level, n_students, replace = TRUE)
classscore$shijian <- shijian
print(head(classscore))

# ---- 15. Distribution of the social-practice grades ----

shijianTable <- table(classscore$shijian)
print(shijianTable)
plot(shijianTable)
pie(shijianTable)
生成数据框并写入文本文件中 + +```R +#生成数据框 +classscore <- data.frame(xuehao,gaoshu,xiandai,yingyu,chengshe) + +#写入文本文件 +write.table(classscore,file = 'classscore.txt') +``` + + + +#### 生成数据框 + +```R + xuehao gaoshu xiandai yingyu chengshe +1 210222001 75 77 96 90 +2 210222002 69 69 88 84 +3 210222003 69 68 73 84 +4 210222004 71 71 89 86 +5 210222005 68 67 70 83 + +..... + +295 210222295 73 74 77 88 +296 210222296 73 74 95 88 +297 210222297 69 69 58 84 +298 210222298 78 82 65 94 +299 210222299 71 71 92 86 +300 210222300 67 66 94 81 +``` + + + +#### 写入classscore.txt文件 + + + +![image-20211115141757702](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115141804.png) + + + + + +### 2. 计算求各种指标 + + + +#### 各科的平均成绩 + +```r +> subjectMean <- apply(classscore[,2:5],2,mean) +> subjectMean + gaoshu xiandai yingyu chengshe +70.03333 70.03667 78.39000 85.04667 +``` + + + +#### 每个人的平均成绩 + +```R +> studentMean <- apply(classscore[,2:5],1,mean) +> studentMean + [1] 84.50 77.50 73.50 79.25 72.00 71.00 79.00 74.75 75.75 78.50 73.75 77.00 73.25 80.50 67.50 70.25 + [17] 80.25 79.00 78.50 73.50 74.00 76.25 74.00 65.25 71.25 73.50 75.25 73.75 73.50 85.50 75.25 77.25 + [33] 80.25 81.50 75.00 79.25 77.00 79.50 73.25 72.00 71.25 75.50 79.75 79.00 73.50 74.50 74.00 77.75 + [49] 77.00 79.75 71.25 76.50 77.00 83.50 68.00 75.75 78.75 69.75 83.50 71.00 75.50 75.50 75.25 70.50 +...... +``` + + + +#### 每个人的最高分 + +```R +> studentMax <- apply(classscore[,2:5],1,max); +> studentMax + [1] 96 88 84 89 83 81 88 85 89 89 87 86 90 94 80 85 93 91 89 87 86 88 84 77 86 83 83 84 88 97 85 85 + [33] 90 98 84 90 93 97 88 81 85 91 91 89 84 89 90 90 85 91 82 87 90 99 81 88 90 84 99 81 84 88 84 82 + [65] 95 79 90 82 89 83 95 90 91 97 82 79 90 89 90 86 87 89 97 96 85 83 94 79 86 89 93 88 96 83 96 93 +...... 
+``` + + + +#### 每个人的最低分 + +```R +> studentMin <- apply(classscore[,2:5],1,min); +> studentMin + [1] 75 69 68 71 67 66 73 66 67 70 63 69 61 71 62 56 71 70 73 63 64 70 69 60 57 68 67 64 59 78 70 70 + [33] 74 71 68 74 66 68 58 64 60 57 71 74 69 59 56 69 70 71 66 72 66 73 61 68 70 57 73 61 69 67 69 65 +...... +``` + + + +#### 每个人的总成绩 + +```R +> studentSum <- apply(classscore[,2:5],1,sum) +> studentSum + [1] 338 310 294 317 288 284 316 299 303 314 295 308 293 322 270 281 321 316 314 294 296 305 296 261 + [25] 285 294 301 295 294 342 301 309 321 326 300 317 308 318 293 288 285 302 319 316 294 298 296 311 +...... +``` + + + +#### 给每个人分数评级 + +首先计算出总成绩的4个五分位数,然后根据总成绩落在的区间将学生的成绩评为A,B,C,D四个区间(代码实际只用到80%、60%、40%三个分位点作为分界,D级对应总成绩最低的约40%) + +```R +> y <- quantile(studentSum, c(.8,.6,.4,.2)) +> classscore$grade[studentSum >= y[1]] <- "A" +> classscore$grade[studentSum >= y[2] & studentSum < y[1]] <- "B" +> classscore$grade[studentSum >= y[3] & studentSum < y[2]] <- "C" +> classscore$grade[studentSum < y[3]] <- "D" +> classscore + xuehao gaoshu xiandai yingyu chengshe grade +1 210222001 75 77 96 90 A +2 210222002 69 69 88 84 B +3 210222003 69 68 73 84 D +4 210222004 71 71 89 86 B +5 210222005 68 67 70 83 D +6 210222006 67 66 70 81 D +7 210222007 73 74 81 88 B +8 210222008 67 66 85 81 D +9 210222009 73 74 67 89 C +10 210222010 70 70 89 85 B +...... 
+``` + + + +#### 姹傛诲垎鏈楂樼殑鍚屽鐨勫鍙 + +```R +> classscore$xuehao[studentSum == max(studentSum)] +[1] 210222170 +``` + +鎬诲垎鏈楂樺悓瀛︾殑瀛﹀彿锛屽嵆鍒嗘暟绛変簬鏈楂樺垎鐨勫悓瀛︾殑瀛﹀彿 + + + + + +### 3.鍥惧儚鐨勭粯鍒 + + + +#### 缁橀珮绛夋暟瀛︽垚缁╃洿鏂瑰浘 + +```R +hist(classscore$gaoshu,breaks = 9,main = '楂樼瓑鏁板鎴愮哗鐩存柟鍥',xlab = '鏁板鎴愮哗') +``` + +![image-20211115151045603](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115151045.png) + +#### 楗肩姸鍥 + +鍏堝鏁板鎴愮哗杩涜璇勭骇锛屽苟缁熻姣忎釜绾у埆鍒嗗埆鏈夊灏戜汉 + +```R +> y <- quantile(classscore$gaoshu, c(.8,.6,.4,.2)) +> gaoshuGrade <- c(1:300) +> gaoshuGrade[classscore$gaoshu >= y[1]] <- "A" +> gaoshuGrade[classscore$gaoshu >= y[2] & classscore$gaoshu < y[1]] <- "B" +> gaoshuGrade[classscore$gaoshu >= y[3] & classscore$gaoshu < y[2]] <- "C" +> gaoshuGrade[classscore$gaoshu < y[3]] <- "D" +> gaoshuGrade + [1] "A" "D" "A" "D" "C" "B" "D" "D" "C" "D" "C" "C" "D" "A" "D" "C" "A" "C" "C" "B" "B" "C" "B" "B" "C" "D" "D" "D" + [29] "D" "D" "C" "A" "A" "A" "D" "D" "D" "A" "C" "B" "D" "D" "B" "C" "D" "C" "A" "A" "A" "C" "A" "C" "B" "A" "B" "C" + [57] "A" "C" "D" "B" "C" "C" "C" "D" "A" "D" "A" "D" "A" "D" "B" "D" "A" "D" "A" "B" "C" "C" "D" "C" "C" "A" "C" "C" + [85] "B" "D" "C" "C" "A" "A" "A" "D" "B" "C" "C" "B" "A" "A" "D" "C" "B" "C" "A" "B" "A" "C" "C" "C" "B" "D" "A" "A" + +...... 
+ +> gaoshuTable <- table(gaoshuGrade) +> gaoshuTable +gaoshuGrade + A B C D +76 63 66 95 +``` + + + +鐢诲嚭楗肩姸鍥撅細 + +```R +pie(gaoshuTable) +``` + +![image-20211116082510213](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211116082531.png) + +鍙互鐪嬪嚭锛屽洓涓瓑绾т腑锛孌绾х浉瀵硅緝澶氾紝C绾х浉瀵硅緝灏 + + + +#### 鏌辩姸鍥 + +```R +barplot(gaoshuTable) +``` + +![image-20211116082846407](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211116082846.png) + + + +#### 鐢婚珮鏁板拰绾夸唬鏁g偣鍥 + +```R +plot(classscore$gaoshu,classscore$xiandai,xlab = '楂樻暟',ylab = '绾夸唬',main = '楂樻暟鍜岀嚎浠') +``` + +![image-20211115190155514](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115190155.png) + +#### 鐢婚珮鏁板拰鑻辫鏁g偣鍥 + +```R +plot(classscore$gaoshu,classscore$yingyu,xlab = '楂樻暟',ylab = '鑻辫',main = '楂樻暟鍜岃嫳璇') +``` + +![image-20211115190944588](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115190944.png) + + + +#### 鐢诲悇绉戞垚缁╃殑绠卞熬鍥 + +```R +boxplot(classscore$gaoshu,classscore$xiandai,classscore$yingyu,classscore$chengshe,main = '楂樻暟锛岀嚎浠o紝鑻辫锛岀▼璁炬垚缁╃殑绠卞熬鍥') +``` + +![image-20211115191024537](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115191024.png) + + + +#### 鏄熻薄鍥 + +```R +stars(classscore[,2:5],full = TRUE,draw.segments = TRUE,key.loc=c(30,1.5)) +``` + +![image-20211115191138286](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115191138.png) + +鎸夌収鍙充笅瑙掔殑鍥句緥锛屾瘡涓浘鐨勭孩鑹诧紝榛戣壊锛岃摑鑹诧紝缁胯壊鍒嗗埆浠h〃绾夸唬銆侀珮鏁般佺▼璁俱佽嫳璇殑鎴愮哗锛岃壊鍧楃殑澶у皬浠h〃鎴愮哗鐨勯珮浣庛 + + + +#### 缁樺埗楂樼瓑鏁板涓庣嚎鎬т唬鏁板叧绯诲浘 + +```R +library(ggplot2) +ggplot(data=classscore, aes(x=gaoshu, y=xiandai)) + + geom_point(pch=17, color="blue", size=2) + + geom_smooth(method="lm", color="red", linetype=2) + + labs(title="gaoshu&xiandai", x="gaoshu", y="xiandai") +``` + +![image-20211115200724108](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211115200724.png) + + + +#### 楂樻暟鍜岃嫳璇殑鍏崇郴鍥 + +```R +library(ggplot2) +ggplot(data=classscore, 
# Lab 2: first look at the iris data set - train/test split, structure,
# class balance, summary statistics and univariate plots.
#
# install.packages("caret")  # one-time setup; this script itself only needs
#                            # base R, caret is used by the later modelling steps

data(iris)

# ---- Train / test split (70% / 30%) ----

# Seed the generator so the split is the same on every run.
set.seed(7)
n_train <- floor(0.7 * nrow(iris))
train_list <- sample(nrow(iris), n_train)
train <- iris[train_list, ]
test <- iris[-train_list, ]

# ---- Structure of the data set ----

# Number of rows and columns.
print(dim(iris))

# Class of every column (first class only, so the result is always character).
print(vapply(iris, function(column) class(column)[1], character(1)))

# First rows of the data.
print(head(iris))

# Levels of the class attribute.
a <- iris$Species
print(levels(a))

# ---- Class balance ----

# Count and percentage of rows per species, side by side.
freq <- table(iris$Species)
percentage <- prop.table(freq) * 100
print(cbind(freq = freq, percentage = percentage))

# ---- Summary statistics of every attribute ----

print(summary(iris))

# ---- Univariate plots ----

# x: the four numeric measurements (inputs); y: the species (output).
x <- iris[, 1:4]
y <- iris[, 5]

# One box plot per measurement, side by side. The previous layout is restored
# afterwards so the bar chart below is drawn on a full-size device.
old_par <- par(mfrow = c(1, ncol(x)))
for (i in seq_along(x)) {
  boxplot(x[[i]], main = names(x)[i])
}
par(old_par)

# Bar chart of the number of rows per species.
plot(y)
瀹夎caret鍖 + +鍔犺浇caret鍖: + +```R +install.packages("caret") +library(caret) +``` + + + +## 4.2.鏌ョ湅楦㈠熬鑺辨暟鎹 + + + + + +![image-20211127225742107](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127225749.png) + +鈥 ![image-20211127225929682](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127225929.png) + +## 4.3.鏁版嵁闆嗗垎鎴愪袱閮ㄥ垎 + +鍏朵腑80锛呭皢鐢ㄤ簬璁粌鎴戜滑鐨勬ā鍨嬶紝20锛呮垜浠皢浣滀负楠岃瘉鏁版嵁闆嗐 + +**鐢熸垚楠岃瘉鏁版嵁闆** + +```r +train_list <- sample(nrow(iris), 0.7*nrow(iris)) +test <- iris[-train_list,] +``` + +**鐢熸垚璁粌鏁版嵁闆** + +``` +train <- iris[train_list,] +``` + + + +## 4.4.鏌ョ湅鏁版嵁闆嗙殑灞炴 + +### 4.4.1鏁版嵁闆嗙殑缁村害(鐢╠im鍑芥暟)銆 + +``` +> dim(iris) +[1] 150 5 +``` + + 150涓暟鎹紝5涓淮搴 + + + +### 4.4.2灞炴х殑绫诲瀷銆 + +``` +sapply(iris, class) + +Sepal.Length Sepal.Width Petal.Length Petal.Width Species + +"numeric" "numeric" "numeric" "numeric" "factor" +``` + + 鏁版嵁鐨5涓淮搴︾殑灞炴у垎鍒槸锛氭暟瀛楋紝鏁板瓧锛屾暟瀛楋紝鏁板瓧锛屽洜瀛 + + + +### 4.4.3绐ヨ鏁版嵁鏈韩(head鍑芥暟)銆 + +``` + head(iris) +``` + + + +![image-20211127230019798](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127230019.png) + +### 4.4.4楦㈠熬鑺变腑class灞炴х殑绾у埆(levels()鍑芥暟)銆 + + + +``` +> a <- factor(iris$Species) +> levels(a) +[1] "setosa" "versicolor" "virginica" +``` + + 绾у埆鏈変笁涓紝鍒嗗埆鏄"setosa" "versicolor" "virginica" + + + +### 4.4.5楦㈠熬鑺辩被鍒崰姣旀儏鍐 + +**瑙i噴璇ヨ浣滅敤** + +``` +percentage <- prop.table(table(dataset$Species)) * 100 +``` + + + +> 姹傚嚭iris鏁版嵁闆嗙殑Species灞炴т腑姣忕鍙栧肩殑鎵鍗犵殑鐧惧垎姣旓紝璧嬩簣percetage鍙橀噺銆 + +**瑙i噴璇ヨ浣滅敤** + +cbind(freq=table(dataset$Species), percentage=percentage) + +> 灏唅ris鐨凷pecies灞炴ф瘡绉嶅彇鍊肩殑棰戞暟鍜屾墍鍗犵殑鐧惧垎姣旂殑琛ㄦ牸鎸夌収鍒楄繘琛屾嫾鎺ャ + +4.6鎵鏈夊睘鎬х殑缁熻鎽樿(鐢╯ummary鍑芥暟) + +``` +summary(iris) +``` + + + +![image-20211127230029422](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127230029.png) + + + +> 鑾峰彇姣忎釜灞炴х殑鏈灏忓硷紝鍥涘垎浣嶆暟锛屾渶澶у笺 +> + + + +## 4.5.鏁版嵁鍙鍖 + +### 4.5.1 鍗曞彉閲忓浘 + +**瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢** + +```r +# x input,y output + +x <- dataset[,1:4] + +y <- dataset[,5] +``` + + + 
+灏唅ris鏁版嵁闆嗙殑鍓嶅洓鍒楁彁鍙栧嚭鏉ヨ祴缁檟 + +灏唅ris鏁版嵁闆嗙殑绗簲鍒楁彁鍙栧嚭鏉ヨ祴缁檡 + + ```r + par(mfrow=c(1,4)) + + for(i in 1:4) { + + boxplot(x[,i], main=names(iris)[i]) + + } + ``` + + + +par()鍑芥暟灏嗙敾鍥惧尯鍩熷垎鍓蹭负4涓紱 + +鍦ㄦ瘡涓垎鍖洪噷鍒嗗埆鐢籭ris鏁版嵁闆嗘瘡涓鍒楃殑绠辩嚎鍥 + + + +![image-20211127230053713](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127230053.png) + +```R +plot(y) +``` + + + +灏唝涓瘡涓肩殑棰戞暟鐢绘垚鏌辩姸鍥 + +![image-20211127230100510](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127230100.png) + + + +### 4.5.2 澶氬彉閲忓浘 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢ㄥ苟鍥炵瓟涓嬮潰鐨勯棶棰 + +```R +# scatterplot matrix + +#featurePlot(x=x, y=y, plot="ellipse") + +#featurePlot(x=x, y=y, plot="box") + + +featurePlot(x=x[y\=="setosa",1:2], y=y[y=="setosa"], plot="ellipse") + +featurePlot(x=x[y\=="versicolor",1:2], y=y[y=="versicolor"], plot="ellipse") + +featurePlot(x=x[y\=="virginica",1:2], y=y[y=="virginica"], plot="ellipse") +``` + + + +鍒嗗埆鐢诲嚭鍝佺涓簊etosa銆乿ersicolor銆乿irginica鐨勯涪灏捐姳鐨勮姳钀奸暱搴︿笌鑺辫惣瀹藉害鐨勫叧绯诲浘锛岃鏄庝笁绉嶈姳鐨勮姳钀奸暱搴﹀拰瀹藉害閮藉ぇ鑷存垚姝g浉鍏冲叧绯 + + ```r + featurePlot(x=x[,1:2], y=y, plot="density") + ``` + +![image-20211127230111781](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211127230111.png) + +鐢诲嚭涓夌鑺辩殑鑺辫惣闀垮害銆佽姳钀煎搴︾殑鍒嗗竷鍥俱傚彲浠ョ湅鍑轰笁绉嶈姳鐨勮姳钀奸暱搴﹀垎甯冨樊鍒緝涓烘槑鏄撅紝鑺辫惣瀹藉害鍒嗗埆宸埆涓嶅お鏄庢樉銆 + + + +**鎬濊冿紵 濡備綍鐭ラ亾浠涔堢嚎鏉¤〃绀虹殑鏄偅绉嶇被鍨嬶紵钃濊壊琛ㄧず鈥渟etosa鈥濓紱绾㈣壊琛ㄧず鈥渧ersicolor鈥濓紱缁胯壊琛ㄧず鈥渧irginica鈥** + + + +绛旓細 + +鍙互鍒嗗埆杩愯濡備笅鎸囦护锛 + +```r +featurePlot(x=x[y\=="setosa",1:2], y=y[y=="setosa"], plot="density") + +featurePlot(x=x[y\=="versicolor",1:2], y=y[y=="versicolor"], plot="density") + +featurePlot(x=x[y\=="virginica",1:2], y=y[y=="virginica"], plot="density") +``` + + + +鍒嗗埆鐢诲嚭setosa銆乿ersicolor銆乿irginica绾挎潯锛屽彂鐜颁笁鏉$嚎鏉$殑棰滆壊鍒嗗埆鏄摑鑹层佺矇鑹层佺豢鑹诧紝璇存槑钃濊壊琛ㄧず鈥渟etosa鈥濓紱绾㈣壊琛ㄧず鈥渧ersicolor鈥濓紱缁胯壊琛ㄧず鈥渧irginica鈥濄 + + + + + +**鍚勪釜鍥剧墖妯旱鍧愭爣鐨勫惈涔夋槸浠涔?** + +> 鍦ㄦき鍦嗗浘涓奡epal.length鏄涓鍒楃殑妯潗鏍囧拰绗洓琛岀殑绾靛潗鏍 +> +> 鍦ㄥ瘑搴﹀浘涓婃槸Sepal.Length鍜孲epal.Width鐨勫 +> + + + +## 4.6.绠楁硶璇勪及 + +浠ヤ笅鏄垜浠皢鍦ㄦ姝ラ涓兜鐩栫殑鍐呭锛 + +1.璁剧疆娴嬭瘯宸ュ叿浠ヤ娇鐢10鍊嶄氦鍙夐獙璇併 + 
+2.寤虹珛5绉嶄笉鍚岀殑妯″瀷, 渚濇嵁楦㈠熬鑺辫姳钀煎拰鑺辩摚鐨勬祴閲忕粨鏋滃鍏舵墍灞炵绉嶇被杩涜棰勬祴. + +3.閫夋嫨鏈浣崇殑妯″瀷銆 + + + +### 4.6.1 10鍊嶄氦鍙夐獙璇 + +鐢10鍊嶄氦鍙夐獙璇佹潵浼拌鍑嗙‘鎬с + +灏嗘暟鎹泦鍒嗘垚10涓儴鍒嗭紝9涓缁冨拰1涓祴璇曪紝鎵鏈夊彲鑳界粍鍚堥兘鐢ㄦ潵璁粌妯″瀷銆 + +```R +control <- trainControl(method="cv", number=10) + +metric <- "Accuracy" +``` + + + +### 4.6.2 寤虹珛妯″瀷 + +鎴戜滑涓嶇煡閬撳摢涓畻娉曞彲浠ヨВ鍐虫闂鎴栦娇鐢ㄥ摢浜涜缃傛垜浠粠涓婇潰鍥句腑鏌愪簺绫诲湪鏌愪簺缁村害涓婂彲浠ラ儴鍒嗙嚎鎬у垎绂伙紝鍥犳鎴戜滑鏈熸湜寰楀埌鏅亶鑹ソ鐨勭粨鏋溿 + +鎴戜滑鏉ヨ瘎浼5绉嶄笉鍚岀殑绠楁硶锛 + +绾挎у垽鍒垎鏋愶紙LDA锛 + +鍒嗙被鍜屽洖褰掓爲锛圕ART锛夈 + +k-鏈杩戦偦灞咃紙kNN锛夈 + +甯︽湁绾挎у唴鏍哥殑鏀寔鍚戦噺鏈猴紙SVM锛夈 + +闅忔満妫灄锛圧F锛 + + + + + +1) #### 绾挎у垽鍒垎鏋愶紙LDA锛 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢 + +```r +set.seed(7) + +fit.lda <- train(Species~., data=dataset, method="lda", metric=metric, trControl=control) +``` + +> 璁剧疆闅忔満绉嶅瓙涓7 +> +> 鐢╨da鏂规硶鏉ヨ缁冩ā鍨嬶紝閫氳繃iris鏁版嵁闆嗙殑鍓嶅洓鍒楁潵棰勬祴Species灞炴с +> + + + +2. #### 鍒嗙被鍜屽洖褰掓爲锛圕ART锛 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢 + +```R +set.seed(7) + +fit.cart <- train(Species~., data=dataset, method="rpart", metric=metric, trControl=control) +``` + +> 璁剧疆闅忔満绉嶅瓙涓7 +> +> 鐢ㄥ垎绫诲拰鍥炲綊鏍戞柟娉曟潵璁粌妯″瀷锛岄氳繃iris鏁版嵁闆嗙殑鍓嶅洓鍒楁潵棰勬祴Species灞炴с +> + + + + + +3) #### k-鏈杩戦偦灞咃紙kNN锛 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢 + +```r +set.seed(7) + +fit.knn <- train(Species~., data=dataset, method="knn", metric=metric, trControl=control) +``` + +> 璁剧疆闅忔満绉嶅瓙涓7 +> +> 鐢 k-鏈杩戦偦灞呮柟娉曟潵璁粌妯″瀷锛岄氳繃iris鏁版嵁闆嗙殑鍓嶅洓鍒楁潵棰勬祴Species灞炴с +> + + + +4) #### 甯︽湁绾挎у唴鏍哥殑鏀寔鍚戦噺鏈猴紙SVM锛 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢 + +```R +set.seed(7) + +fit.svm <- train(Species~., data=dataset, method="svmRadial", metric=metric, trControl=control) +``` + +> 璁剧疆闅忔満绉嶅瓙涓7 +> +> 鐢ㄥ甫鏈夌嚎鎬у唴鏍哥殑鏀寔鍚戦噺鏈烘柟娉曟潵璁粌妯″瀷锛岄氳繃iris鏁版嵁闆嗙殑鍓嶅洓鍒楁潵棰勬祴Species灞炴с +> + + + + + +5) #### 闅忔満妫灄锛圧F锛 + +瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢 + + ```R + set.seed(7) + + fit.rf <- train(Species~., data=dataset, method="rf", metric=metric, trControl=control) + ``` + +> 璁剧疆闅忔満绉嶅瓙涓7 +> +> 鐢ㄩ殢鏈烘.鏋楁柟娉曟潵璁粌妯″瀷锛岄氳繃iris鏁版嵁闆嗙殑鍓嶅洓鍒楁潵棰勬祴Species灞炴с +> + + + +### 4.6.3閫夋嫨鏈浣虫ā鍨 + +鐜板湪姣忎釜閮芥湁5涓ā鍨嬪拰绮惧害浼扮畻銆傛垜浠渶瑕佸皢妯″瀷鐩镐簰姣旇緝骞堕夋嫨鏈鍑嗙‘鐨勬ā鍨嬨 + +鍙互閫氳繃棣栧厛鍒涘缓鎵鍒涘缓妯″瀷鐨勫垪琛ㄥ苟浣跨敤summary() 鍑芥暟鏄剧ず姣忎釜妯″瀷鐨勫噯纭с + +**瑙i噴涓嬮潰姣忔潯璇彞鐨勪綔鐢** + +```R +results <- 
resamples(list(lda=fit.lda, cart=fit.cart, knn=fit.knn, svm=fit.svm, rf=fit.rf)) + +summary(results) +``` + +> 鍒涘缓涓涓眹鎬昏〃锛 +> +> 璋冪敤summary()鍑芥暟锛屽苟浼犲叆resamples()鍑芥暟鍊笺傚畠浼氬垱寤轰竴涓〃鏍硷紝姣忚鏄竴绉嶇畻娉曪紝姣忓垪鏄瘎浼版寚鏍囥 + + + +灏嗘ā鍨嬬殑鍑嗙‘鐜囩敾鍥炬樉绀 + +```R +dotplot(results) +``` + +\#瀵圭粨鏋滆繘琛岃鏄庡垎鏋 + + ![image-20211129182934357](https://raw.githubusercontent.com/zhangchenqi123/imgCloud/main/img/20211129182934.png) + + + +鐢卞浘涓彲鐭ワ紝棰勬祴鍑嗙‘搴﹀拰璁粌缁撴灉鐨勫ソ鍧忥紙Kappa鍊)浠庡ぇ鍒板皬鍒嗗埆鏄細 `lda ` , `knn` , `rf` , `svm` , `cart` 銆 + +妯″瀷鐨勭疆淇″害涓0.95銆 + +鍙互鐪嬪埌鍦ㄨ繖绉嶆儏鍐典笅鏈鍑嗙‘鐨勬ā鍨嬫槸LDA锛 + +鏌ョ湅LDA妯″瀷鐨勬儏鍐: + +```R +> print(fit.lda) +Linear Discriminant Analysis + +150 samples + 4 predictor + 3 classes: 'setosa', 'versicolor', 'virginica' + +No pre-processing +Resampling: Cross-Validated (10 fold) +Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... +Resampling results: + + Accuracy Kappa + 0.98 0.97 +``` + +lda棰勬祴鐨勫噯纭害涓0.98锛宬appa妫楠屽间负0.97锛屾帴杩1锛岃鏄庤缁冩晥鏋滃緢濂姐 + + + +## 4.7.鍋氬嚭棰勬祴 + +鐩存帴鍦ㄩ獙璇侀泦涓婅繍琛孡DA妯″瀷锛屽苟鍦ㄦ贩娣嗙煩闃典腑鎬荤粨缁撴灉銆 + + ```R + predictions <- predict(fit.lda, validation) + + confusionMatrix(predictions, validation$Species) + ``` + + + + 缁撴灉濡備笅锛 + +```R + Class: setosa Class: versicolor Class: virginica +Sensitivity 1.0000 0.9600 0.9800 +Specificity 1.0000 0.9900 0.9800 +Pos Pred Value 1.0000 0.9796 0.9608 +Neg Pred Value 1.0000 0.9802 0.9899 +Prevalence 0.3333 0.3333 0.3333 +Detection Rate 0.3333 0.3200 0.3267 +Detection Prevalence 0.3333 0.3267 0.3400 +Balanced Accuracy 1.0000 0.9750 0.9800 +``` + + + +**瀵瑰悇绉嶅弬鏁拌繘琛岃В閲**锛 + +| 鍙傛暟 | 瑙i噴 | +| -------------------- | -------------------------------------- | +| Sensitivity | 棰勬祴涓烘瀹為檯鍗犳鐨勬瘮渚 | +| Specificity | 棰勪及涓鸿礋瀹為檯涓鸿礋鐨勬瘮渚 | +| Pos Pred Value | 鐪熼槼鐜囷紝瀹為檯涓烘鍗犻娴嬩负姝g殑姣斾緥 | +| Neg Pred Value | 鐪熼槾鐜囷紝瀹為檯涓鸿礋鍗犻娴嬩负璐熺殑姣斾緥 | +| Prevalence | 棰勬祴缁撴灉涓湡瀹炴鏍锋湰鍗犳绘牱鏈殑姣斾緥 | +| Detection Rate | 妫鍑虹巼锛岄娴嬩负姝f鏍锋湰鍗犳墍鏈夋牱鏈殑姣旂巼 | +| Detection Prevalence | 棰勬祴涓烘鍗犳绘牱鏈殑姣旂巼 | +| Balanced Accuracy | (鐪熼槼鐜+鐪熼槾鐜)/2 | + + + +浠庢贩娣嗙煩闃典腑鍙互鐪嬪嚭锛岄娴嬪噯纭巼閮借兘杈惧埌97.5%浠ヤ笂锛岃鏄庨娴嬬粨鏋滈潪甯稿ソ銆 +