Rev note: Some (possibly) useful commands and experiments done in class are added.

Data management:

Remove the id column from the 2017 data and write the result to a csv file. In addition, I randomly reorder the rows for anonymity. (The R code is included for your reference.)

# Reference only (kept commented out): read the raw 2017 file, drop the id
# column, shuffle the rows and export the result.
#t<-matrix(scan("midstat2017m.csv"),ncol=3,byrow=T)
## The first column is id and intentionally left out
# NOTE: despite its name, exam16 holds the 2017 data read on the line above.
#exam16 <-data.frame(year=as.factor(t[,2]),mid=t[,3])
# sample(nrow(x)) is a random permutation of the row indices.
#exam16<-exam16[sample(nrow(exam16)),]
## Two different csv outputs
# write.csv keeps row names and a header line; the write.table call below
# writes neither and separates fields with a single space.
#write.csv(exam16, file = "stat17noid2.csv")
#write.table(exam16, file = "stat17noid.csv",row.names=FALSE, na="",col.names=FALSE, sep=" ")

Import stat 2017 data

library(data.table)
data.table 1.10.4
  The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
  Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
  Release notes, videos and slides: http://r-datatable.com
# Local-file alternative, kept for reference:
#stat17<-fread('stat17noid.csv')
# Read the 2017 id-free midterm file directly from the course web site.
stat17 <- fread("http://faculty.ndhu.edu.tw/~chtsao/ftp/stat17noid.csv")
trying URL 'http://faculty.ndhu.edu.tw/~chtsao/ftp/stat17noid.csv'
Content type 'text/plain' length 438 bytes
==================================================
downloaded 438 bytes
# Name the two columns explicitly.
names(stat17) <- c("year", "mid")
# Store year as a factor so summary() tabulates it instead of averaging it.
stat17$year <- as.factor(stat17$year)
summary(stat17)
 year        mid        
 2:47   Min.   :-50.00  
 3:10   1st Qu.: 15.00  
 4: 5   Median : 30.00  
 7: 1   Mean   : 39.75  
        3rd Qu.: 58.00  
        Max.   :108.00  

Data cleansing using subset command

# Show the rows whose midterm score is negative before dropping them.
stat17[mid < 0]
# Keep only the non-negative scores, then re-check the summary.
stat17 <- subset(stat17, mid >= 0)
summary(stat17)
 year        mid        
 2:47   Min.   :  4.00  
 3:10   1st Qu.: 15.00  
 4: 5   Median : 30.50  
 7: 0   Mean   : 41.19  
        3rd Qu.: 58.00  
        Max.   :108.00  
# You may also view the whole dataframe in the Environment pane ~ View(stat17)

Import stat 2016 data

# Read the 2016 scores directly from the course web site.
stat16 <- fread("http://faculty.ndhu.edu.tw/~chtsao/ftp/stat2016.txt")
trying URL 'http://faculty.ndhu.edu.tw/~chtsao/ftp/stat2016.txt'
Content type 'text/plain' length 558 bytes
==================================================
downloaded 558 bytes
# Name the three columns explicitly.
names(stat16) <- c("year", "mid", "final")
# Store year as a factor so summary() tabulates it instead of averaging it.
stat16$year <- as.factor(stat16$year)
head(stat16)   # Take a quick look at the first few cases
summary(stat16)
 year        mid             final       
 2:42   Min.   :  0.00   Min.   :-10.00  
 3:15   1st Qu.: 25.50   1st Qu.: 13.50  
 4: 3   Median : 46.00   Median : 31.00  
 5: 3   Mean   : 46.52   Mean   : 34.11  
        3rd Qu.: 67.50   3rd Qu.: 53.00  
        Max.   :110.00   Max.   :100.00  

Now we have two data frames: stat16 (year, mid, final) and stat17 (year, mid).

Where are we heading

Where are we now? What do we know? What do we want to know (but do not know yet)?

~~Walk~~ Prog before you run. Think before you prog.

Some handy functions/commands for exploratory data analysis and data cleansing

suppressMessages(library(dplyr)) # attach dplyr without its startup messages
# Subsets of the 2016 data: each of years 2 and 3 alone, then both together.
stat16.2 <- filter(stat16, year == 2)
stat16.3 <- filter(stat16, year == 3)
stat16.23 <- filter(stat16, year %in% c(2, 3))
summary(stat16.23)
 year        mid             final       
 2:42   Min.   :  0.00   Min.   :-10.00  
 3:15   1st Qu.: 25.00   1st Qu.: 14.00  
 4: 0   Median : 44.00   Median : 33.00  
 5: 0   Mean   : 46.07   Mean   : 34.63  
        3rd Qu.: 70.00   3rd Qu.: 53.00  
        Max.   :110.00   Max.   :100.00  
library(ggplot2)
Stackoverflow is a great place to get help:
http://stackoverflow.com/tags/ggplot2.
# Midterm against final for years 2 and 3; year sets both colour and shape.
scatter <- ggplot(stat16.23, aes(x = mid, y = final))
scatter +
  geom_point(aes(colour = year, shape = year)) +
  labs(
    x = "midterm",
    y = "final",
    title = "Midterm vs Final Plot (Stat16.23)"
  )

More

# Base layer: points coloured and shaped by year.
smooth <- ggplot(stat16.23, aes(x = mid, y = final, colour = year)) +
  geom_point(aes(shape = year), size = 1.5) +
  labs(x = "mid", y = "final", title = "Scatterplot with smoothers")

# Linear model: one fitted line per year (colour is mapped to year)
smooth + geom_smooth(method = "lm")

# Double check the year-2 fit against the lm() console output
plot(final ~ mid, data = stat16.2)
m16.2 <- lm(final ~ mid, data = stat16.2)
summary(m16.2)

Call:
lm(formula = final ~ mid, data = stat16.2)

Residuals:
    Min      1Q  Median      3Q     Max 
-31.051 -14.020  -4.080   7.962  55.367 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   9.5001     6.2849   1.512    0.139    
mid           0.6388     0.1202   5.315 4.32e-06 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 19.65 on 40 degrees of freedom
Multiple R-squared:  0.4139,    Adjusted R-squared:  0.3993 
F-statistic: 28.25 on 1 and 40 DF,  p-value: 4.317e-06
# Overlay the fitted regression line from m16.2 on the current plot.
abline(reg = m16.2)

# 2017 data restricted to years 2 and 3, compared with side-by-side boxplots.
stat17.23 <- filter(stat17, year %in% c(2, 3))
boxplot(mid ~ year, data = stat17.23)
# Per-year subsets and their summaries.
stat17.2 <- filter(stat17, year == 2)
stat17.3 <- filter(stat17, year == 3)
summary(stat17.2)
summary(stat17.3)
 year        mid        
 2:47   Min.   :  4.00  
 3: 0   1st Qu.: 14.00  
 4: 0   Median : 31.00  
 7: 0   Mean   : 39.79  
        3rd Qu.: 58.00  
        Max.   :108.00  
 year        mid        
 2: 0   Min.   : 20.00  
 3:10   1st Qu.: 23.00  
 4: 0   Median : 34.50  
 7: 0   Mean   : 48.90  
        3rd Qu.: 74.75  
        Max.   :105.00  
# One row, two panels: year 3 is drawn first (left), year 2 second (right).
par(mfrow = c(1, 2))

hist(stat17.3$mid)
hist(stat17.2$mid)

# Year-2 midterm scores, highest first.
smid17.2 <- sort(stat17.2$mid, decreasing = TRUE)
# Empirical probability (plotting position) of each score.  Dividing the rank
# by n gives exactly 1 for the top score, and qnorm(1) is Inf, so that point
# would drop out of a normal quantile plot.  Using (rank - 0.5) / n keeps every
# value strictly inside (0, 1).  n is taken from the data rather than typed in.
ep <- (rank(smid17.2) - 0.5) / length(smid17.2)
summary(smid17.2)
sd(smid17.2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   4.00   14.00   31.00   39.79   58.00  108.00 
[1] 31.68317
# First few entries of the decreasing-sorted vector, i.e. the top scores.
head(smid17.2)
[1] 108 105 100  95  95  95
# 27th entry of the decreasing-sorted vector, i.e. the 27th-highest score.
smid17.2[27]
[1] 27
# Normal quantiles matching each empirical probability.  The mean and sd are
# computed from the scores themselves instead of being copied by hand from the
# rounded console output (39.79 and 31.68), so they follow the data.
qnorm.ep <- qnorm(ep, mean = mean(smid17.2), sd = sd(smid17.2))
smid17.2
 [1] 108 105 100  95  95  95  88  80  78  60  60  58  58  55  48  48  45  45  44  44
[21]  44  40  32  31  28  27  27  24  21  20  16  15  15  15  15  13  10  10  10   8
[41]   7   6   6   6   6   5   4
# Observed scores against their matching normal quantiles.
plot(qnorm.ep, smid17.2)

LS0tCnRpdGxlOiAiV29yayBvbiBTdGF0IDIwMTYvMjAxNyBleGFtIGRhdGEgKHJldikiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KUmV2IG5vdGU6IFNvbWUgKHBvc3NpYmx5KSB1c2VmdWwgY29tbWFuZHMgYW5kIGV4cGVyaW1lbnRzIGRvbmUgaW4gY2xhc3MgYXJlIGFkZGVkLiAKCiMjIERhdGEgbWFuYWdlbWVudDogCgpSZW1vdmUgaWQgZGF0YSBmcm9tIDIwMTcgZGF0YSBhbmQgb3V0cHV0IGFzIGEgY3N2IGZpbGUuIEluIGFkZGl0aW9uLApJIGFsc28gcmFuZG9tbHkgcmVvcmRlciB0aGUgcm93cyBmb3IgYW5vbnltaXR5LiAoUiBjb2RlcyBGWVIvRm9yIFlvdXIgUmVmZXJlbmNlKQpgYGB7ciBSZW1vdmUgaWR9CiN0PC1tYXRyaXgoc2NhbigibWlkc3RhdDIwMTdtLmNzdiIpLG5jb2w9MyxieXJvdz1UKQojIyBUaGUgZmlyc3QgY29sdW1uIGlzIGlkIGFuZCBpbnRlbnRpb25hbGx5IGxlZnQgb3V0CiNleGFtMTYgPC1kYXRhLmZyYW1lKHllYXI9YXMuZmFjdG9yKHRbLDJdKSxtaWQ9dFssM10pCiNleGFtMTY8LWV4YW0xNltzYW1wbGUobnJvdyhleGFtMTYpKSxdCiMjIFR3byBkaWZmZXJlbnQgY3N2IG91dHB1dHMKI3dyaXRlLmNzdihleGFtMTYsIGZpbGUgPSAic3RhdDE3bm9pZDIuY3N2IikKI3dyaXRlLnRhYmxlKGV4YW0xNiwgZmlsZSA9ICJzdGF0MTdub2lkLmNzdiIscm93Lm5hbWVzPUZBTFNFLCBuYT0iIixjb2wubmFtZXM9RkFMU0UsIHNlcD0iICIpCmBgYApJbXBvcnQgc3RhdCAyMDE3IGRhdGEgCmBgYHtyfQpsaWJyYXJ5KGRhdGEudGFibGUpCiNzdGF0MTc8LWZyZWFkKCdzdGF0MTdub2lkLmNzdicpCnN0YXQxNzwtZnJlYWQoJ2h0dHA6Ly9mYWN1bHR5Lm5kaHUuZWR1LnR3L35jaHRzYW8vZnRwL3N0YXQxN25vaWQuY3N2JykKY29sbmFtZXMoc3RhdDE3KTwtYygieWVhciIsIm1pZCIpCnN0YXQxNyR5ZWFyPC1hcy5mYWN0b3Ioc3RhdDE3JHllYXIpCnN1bW1hcnkoc3RhdDE3KQpgYGAKRGF0YSBjbGVhbnNpbmcgdXNpbmcgc3Vic2V0IGNvbW1hbmQKYGBge3J9CnN0YXQxN1t3aGljaChtaWQgPCAwKV0Kc3RhdDE3PC1zdWJzZXQoc3RhdDE3LG1pZCA+PSAwKQpzdW1tYXJ5KHN0YXQxNykKIyBZb3UgbWF5IGFsc28gdmlldyB0aGUgd2hvbGUgZGF0YWZyYW1lIGluIHRoZSBFbnZpcm9ubWVudCBwYW5lIH4gVmlldyhzdGF0MTcpCmBgYApJbXBvcnQgc3RhdCAyMDE2IGRhdGEgCmBgYHtyfQpzdGF0MTYgPC0gZnJlYWQoJ2h0dHA6Ly9mYWN1bHR5Lm5kaHUuZWR1LnR3L35jaHRzYW8vZnRwL3N0YXQyMDE2LnR4dCcpCmNvbG5hbWVzKHN0YXQxNik8LWMoInllYXIiLCJtaWQiLCJmaW5hbCIpCnN0YXQxNiR5ZWFyPC1hcy5mYWN0b3Ioc3RhdDE2JHllYXIpCmhlYWQoc3RhdDE2KSAgICMgVGFrZSBhIHF1aWNrIGxvb2sgb2YgZmlyc3QgZmV3IGNhc2VzCnN1bW1hcnkoc3RhdDE2KQpgYGAKTm93IHdlIGhhdmUgdHdvIGRhdGFmcmFtZXMsIHN0YXQyMDE2KHllYXIsIG1pZCwgZmluYWwpLCBzdGF0MjAxNyh5ZWFyLG1pZCkuIAoKIyNXaGVy
ZSBhcmUgd2UgaGVhZGluZwpXaGVyZSBhcmUgd2Ugbm93PyBXaGF0IGRvIHdlIGtub3c/IFdoYXQgZG8gd2Ugd2FudCB0byBrbm93IChidXQgdW5rbm93biBub3cpPwoKPiB+fldhbGt+fiBQcm9nIGJlZm9yZSB5b3UgcnVuLiBUaGluayBiZWZvcmUgeW91IHByb2cuCgpTb21lIGhhbmR5IGZ1bmN0aW9ucy9jb21tYW5kcyBmb3IgZXhwbG9yYXRvcnkgZGF0YSBhbmFseXNpcyBhbmQgZGF0YSBjbGVhbnNpbmcKYGBge3J9CnN1cHByZXNzTWVzc2FnZXMobGlicmFyeShkcGx5cikpICMgbG9hZCBwYWNrYWdlIGRwbHlyIGJ1dCBzdXBwcmVzcyBpdHMgbWVzc2FnZXMKc3RhdDE2LjIzPC1maWx0ZXIoc3RhdDE2LCB5ZWFyID09IDIgfCB5ZWFyID09IDMpCnN0YXQxNi4yPC1maWx0ZXIoc3RhdDE2LHllYXI9PTIgKQpzdGF0MTYuMzwtZmlsdGVyKHN0YXQxNix5ZWFyPT0zKQpzdW1tYXJ5KHN0YXQxNi4yMykKYGBgCmBgYHtyfQpsaWJyYXJ5KGdncGxvdDIpCnNjYXR0ZXIgPC0gZ2dwbG90KGRhdGE9c3RhdDE2LjIzLCBhZXMoeCA9IG1pZCwgeSA9IGZpbmFsKSkgCnNjYXR0ZXIgKyBnZW9tX3BvaW50KGFlcyhjb2xvcj15ZWFyLCBzaGFwZT15ZWFyKSkgKwogIHhsYWIoIm1pZHRlcm0iKSArICB5bGFiKCJmaW5hbCIpICsKICBnZ3RpdGxlKCJNaWR0ZXJtIHZzIEZpbmFsIFBsb3QgKFN0YXQxNi4yMykiKQpgYGAKTW9yZSAKYGBge3J9CnNtb290aCA8LSBnZ3Bsb3QoZGF0YT1zdGF0MTYuMjMsIGFlcyh4PW1pZCwgeT1maW5hbCwgY29sb3I9eWVhcikpICsgCiAgZ2VvbV9wb2ludChhZXMoc2hhcGU9eWVhciksIHNpemU9MS41KSArIHhsYWIoIm1pZCIpICsgeWxhYigiZmluYWwiKSArIAogIGdndGl0bGUoIlNjYXR0ZXJwbG90IHdpdGggc21vb3RoZXJzIikKCiMgTGluZWFyIG1vZGVsCnNtb290aCArIGdlb21fc21vb3RoKG1ldGhvZD0ibG0iKQoKI0RvdWJsZSBjaGVjayB3aXRoIGNvbnNvbGUgb3V0cHV0CnBsb3QoZmluYWx+bWlkLCBkYXRhPXN0YXQxNi4yKQptMTYuMjwtbG0oZmluYWx+bWlkLGRhdGE9c3RhdDE2LjIpOyBzdW1tYXJ5KG0xNi4yKTsKYWJsaW5lKG0xNi4yKQoKYGBgCgoKYGBge3J9CnN0YXQxNy4yMzwtZmlsdGVyKHN0YXQxNywgeWVhciA9PSAyIHwgeWVhciA9PSAzKQpib3hwbG90KG1pZH55ZWFyLCBkYXRhPXN0YXQxNy4yMykKc3RhdDE3LjI8LWZpbHRlcihzdGF0MTcseWVhcj09MiApCnN0YXQxNy4zPC1maWx0ZXIoc3RhdDE3LHllYXI9PTMpCnN1bW1hcnkoc3RhdDE3LjIpO3N1bW1hcnkoc3RhdDE3LjMpCnBhcihtZnJvdz1jKDEsMikpOwpoaXN0KHN0YXQxNy4zJG1pZCk7aGlzdChzdGF0MTcuMiRtaWQpCmBgYAoKYGBge3J9CnNtaWQxNy4yPC1zb3J0KHN0YXQxNy4yJG1pZCwgZGVjcmVhc2luZz1UUlVFKQplcDwtcmFuayhzbWlkMTcuMikvNDcKc3VtbWFyeShzbWlkMTcuMik7c2Qoc21pZDE3LjIpCmhlYWQoc21pZDE3LjIpCgpzbWlkMTcuMlsyN10KcW5vcm0uZXA8LXFub3JtKGVwLDM5Ljc5LDMxLjY4KQpzbWlkMTcuMgpwbG90
KHNtaWQxNy4yfnFub3JtLmVwKQpgYGAKCiMjIyBMaW5rcyBhbmQgUmVmZXJlbmVzOgoKICAtIFtRdWljay1SOiBEYXRhIG1hbmFnZW1lbnRdKGh0dHA6Ly93d3cuc3RhdG1ldGhvZHMubmV0L21hbmFnZW1lbnQvaW5kZXguaHRtbCkKICAtIFtEYXRhIHZpc3VhbGl6YXRpb24gd2l0aCBnZ3Bsb3QyXShodHRwczovL3d3dy5tYWlsbWFuLmNvbHVtYmlhLmVkdS9zaXRlcy9kZWZhdWx0L2ZpbGVzL21lZGlhL2ZkYXdnX2dncGxvdDIuaHRtbCkKICAtIFtEYXRhIE1hbmlwdWxhdGlvbiB3aXRoICBkcGx5cl0oaHR0cHM6Ly93d3cuci1ibG9nZ2Vycy5jb20vZGF0YS1tYW5pcHVsYXRpb24td2l0aC1kcGx5ci8pCiAgLSBbSW50cm8gdG8gZHBseXJdKGh0dHBzOi8vY3Jhbi5yc3R1ZGlvLmNvbS93ZWIvcGFja2FnZXMvZHBseXIvdmlnbmV0dGVzL2ludHJvZHVjdGlvbi5odG1sKQoKCg==