\documentclass[landscape,30pt]{foils}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{color}
\usepackage{subfigure}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage[UTF8]{ctex}

\title{Artificial Intelligence}

\author{} 
\date{}
\MyLogo{Copyright \copyright~2018, 2019 Hui Lan}

\begin{document}
\maketitle


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Syllabus}
\begin{itemize}
\item Textbook: Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani. {\bf An Introduction to Statistical Learning: with Applications in R}. [Download: http://www-bcf.usc.edu/$\sim$gareth/ISL/ISLR Seventh Printing.pdf]
\item Grading: class participation (30 points), homework (20 points), final exam (50 points).
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Artificial intelligence}

\begin{itemize}
\item What is intelligence? A person's wisdom and capacity for action.
\item Intelligence - the ability to acquire and apply knowledge and skills. - Bing Dictionary
\item Artificial intelligence -- machine intelligence, usually realized through programming. The field dates from 1956.
\item Strengths of AI: formidable data-processing and computing power, far beyond human reach. Weaknesses: no real understanding; it cannot think; it follows rules, discovers no new knowledge, and does not evolve.
\item Natural intelligence -- intelligence acquired through natural evolution, as in humans, animals, and plants: sensing external signals, reacting to them, turning reactions into experience, and reasoning from that experience when facing something new. Think of Galileo, Newton, Euler, Faraday.
\item How far are we from true artificial intelligence? ``AI is whatever hasn't been done yet.'' - Tesler's Theorem
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Artificial intelligence, or human intelligence doing the programming?}

(As for what Google claims) the learning ability has not been fully verified. -- Liu Cixin

Humans know very little about the detailed deep structure and workings of their own brains. Without a major breakthrough in brain science, true artificial intelligence is a fantasy. -- Liu Cixin

As long as basic science achieves no major breakthrough, applied science will run into a bottleneck. -- Liu Cixin

{\small From Xinhuanet, ``Liu Cixin on AlphaGo and Artificial Intelligence'' (article of 13 March 2016)}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The AI winter}

Research boomed in the US and the UK in the 1960s. Hard, unresolved difficulties led governments to cut support in the 1970s, and AI research entered a winter. See the UK's Lighthill Report (1973).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{News}

\begin{center}
\includegraphics[width=0.95\textwidth]{AI_new_5MAR2018}
\end{center}

``Cognitive learning is all about teaching computers to learn {\em without having to explicitly program them},'' Rad said. 

``So how do we become better? {\em We learn from experience}.'' Rad said. {\color{blue}Context is important.}  

Children, for example, begin by identifying objects such as faces and toys, then move on from there to {\em understand} communication.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Current subfields of AI}

Robotics (Amazon order picking), machine learning (SVMs in the 1990s, deep learning in the 2010s), natural language processing (the Afghanistan and Iraq battlefields), strategy games (Deep Blue at chess in 1997, AlphaGo in 2016), computer vision, driverless cars.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{AI is genuinely useful -- for example, driverless cars (autonomous vehicles)}
Built on GPS, radar, lasers, networking, sensors, cameras, computers, and supporting road infrastructure. ``Where am I?'' ``What is around me?'' ``What should I do next?'' -- Paul Newman
\begin{itemize}
\item Saves time. The average driver in England spends 235 hours a year driving.
\item Reduces congestion and fuel consumption: the best driving strategy is computed from surrounding road and vehicle conditions, and cars communicate with one another.
\item A boon for people without a driving licence: choose a start and a destination, and a car comes to pick you up.
\item Fewer traffic accidents: phone calls, inattention, haste, misjudgement, blind spots, rule-breaking.
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The situation in the UK}

Goal: global leader in driverless cars, entering service between 2020 and 2030. Safer, smarter, smoother.

11 July 2016 - \href{https://www.gov.uk/government/news/new-measures-to-help-britain-lead-the-way-in-developing-driverless-technology}{The government set up a £30 million intelligent mobility fund, with corresponding changes to traffic law and insurance}.

13 October 2016 - \href{https://www.newscientist.com/article/2108977-first-uk-trial-of-driverless-pods-paves-way-for-autonomous-taxis/}{A two-seat driverless taxi was tested on the pavements of Milton Keynes, carrying an iPad, at 8 km/h.}
\begin{center}
\includegraphics[width=0.8\textwidth]{Rex_Shutterstock}
\end{center}
{\small Rex/Shutterstock}

24 April 2017 - \href{http://www.bbc.com/news/technology-39691540}{A consortium of UK companies led by Oxbotica will test driverless cars on UK roads in 2019}: several cars on public roads between London and Oxford, reporting conditions to one another (with humans on board).



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{A brief history of statistical learning}

\begin{itemize}
\item Linear methods. At the beginning of the 19th century, Legendre and Gauss published papers on the method of {\bf Least Squares}, a linear regression technique applied mainly in astronomy. In 1936 Fisher proposed {\bf Linear Discriminant Analysis (LDA)}. The 1940s brought {\bf Logistic Regression (LR)}. By the 1970s most methods were linear; Nelder and Wedderburn introduced Generalized Linear Models, which include linear regression and LR as special cases.
\item Fitting nonlinear methods demands heavy computation. With the arrival of computers, the 1980s saw nonlinear methods and their implementations. Breiman, Friedman, Olshen and Stone proposed classification and regression trees. Hastie and Tibshirani introduced Generalized Additive Models in 1986.
\item An important reference: Hastie, Tibshirani and Friedman, {\bf The Elements of Statistical Learning} (2001, 2009).
\item The R language and RStudio, with many useful, widely used packages. Free!
\item Very broad applications: business, health, biology, genetics, psychology, earth science, social science, public policy. With ever more data, statistical learning will only grow more useful.
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Statistical Learning}

\begin{itemize}
\item {\bf Learning is hard, and the process is long.} Collecting data is hard; analyzing (understanding) data is hard too.
\item Supervised learning (for building models and making predictions); unsupervised learning (for discovering the internal structure of the data).
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Course goals}
\begin{itemize}
\item Introduce some useful statistical learning methods and models: the intuition behind them, their assumptions, and their strengths and weaknesses (no single method is best). Their applications reach far beyond statistics.
\item KNN (K-nearest neighbor) for simple classification. Linear regression for fitting and prediction. LR and LDA for classification. Controlling overfitting with ridge and the lasso. Classification and regression trees.
\item Ensemble methods: bagging, stacking, boosting, random forests.
%% \item SVMs.
%% \item Unsupervised methods: principal components analysis, K-means clustering, hierarchical clustering.
\item Performance assessment: cross-validation and the bootstrap.
%% \item Intuition.
\item Applying these methods to real problems.
\end{itemize}
%% Materials: http://www-bcf.usc.edu/~gareth/ISL/
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Mathematical notation}
\begin{itemize}
\item $n$ data points, $x_i, i=1,2, ..., n$.
\item Each data point has observations on $p$ variables, $x_i = \begin{pmatrix} x_{i1} \\ x_{i2} \\ \vdots \\ x_{ip} \end{pmatrix}$.
\item The whole data set can be viewed as an $n \times p$ matrix $\mathbf{X} = \begin{pmatrix} x_{1}^T \\ x_{2}^T \\ \vdots \\ x_{n}^T \end{pmatrix}$, where $T$ denotes transpose.
\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Implementation language: R}
RStudio is the recommended environment. There are many useful, powerful packages, such as MASS.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Example: does more study time mean better grades?}
Survey 30 students on their grade in a course versus the number of hours per week they spend on it. % simulate_grade_hours.R
\begin{center}
\includegraphics[width=0.8\textwidth]{grade_vs_hours_linear}
\end{center}
\begin{center}
\includegraphics[width=0.8\textwidth]{grade_vs_hours_nonlinear}
\end{center}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Wage versus age, year, and education level}
\begin{center}
\includegraphics[width=0.9\textwidth]{ISLR_Fig1_1.png}
\end{center}
Wage data for 3000 Americans (from the mid-Atlantic region). Wage is related to age, year, and education level. Wage is the output (response variable); age, year, and education are inputs (predictor variables). Wage can be treated as continuous, so this is a regression problem.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Stock market data}

Given a stock index's movements over the previous 5 days, can we predict whether it goes up or down tomorrow? The output takes only two values, up or down: it is categorical (qualitative).
\begin{center}
\includegraphics[width=0.9\textwidth]{ISLR_Fig1_2.png}
\end{center}
Daily percentage changes of the Standard \& Poor's stock index: 1250 days of data, with the index rising the next day on 648 of them and falling on 602. A classification problem.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Gene expression data}

A clustering problem: expression data for 6830 genes in 64 cancer cell lines; cluster the 64 cell lines.

\begin{center}
\includegraphics[width=0.9\textwidth]{ISLR_Fig1_4.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{A question to think about}

Natural intelligence is the intelligence of organisms produced by evolution. Is artificial intelligence -- AlphaGo, say -- really intelligent? If so, explain why. If not, what advantages does today's AI have over natural intelligence, and where does it need to improve?

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{What is statistical learning?}

{\Large $Y = f(X) + \epsilon$.}

$X$ is the input variable and $Y$ the output (response) variable; $f$ is the hidden, true function, and $\epsilon$ is an error term independent of $X$. Statistical learning is a set of methods for estimating $f$; the estimate is denoted $\hat{f}$. Applying $\hat{f}$ to $X$ gives $\hat{Y} = \hat{f} (X)$, the predicted value.

How large is the expectation $E[(Y-\hat{Y})^2]$? Consider the special case $f = \hat{f}$.

\begin{align*}
E[(Y-\hat{Y})^2] &= E[(f(X) + \epsilon - \hat{f}(X))^2] \\
&= E[(f(X) - \hat{f}(X) + \epsilon)^2] \\
&= E[(f(X) - \hat{f}(X))^2] \\
&\quad + 2 \cdot E[\epsilon] \cdot E[f(X) - \hat{f}(X)] + E[\epsilon^2] \\
&= E[(f(X) - \hat{f}(X))^2] + \mathrm{Var}[\epsilon],
\end{align*}
since $E[\epsilon]=0$ and $\mathrm{Var}[\epsilon] = E[\epsilon^2] - E[\epsilon]^2$.

$\mathrm{Var}[\epsilon] = \sigma^2$ is called irreducible: however good $\hat{f}$ is, the error $\epsilon$ remains. Why? Important variables may not have been measured, and some variation cannot be measured at all. For example, suppose we predict a patient's adverse reaction to a drug from the concentrations of various substances in the blood. The reaction may also depend on other factors, such as manufacturing variation in the drug or the patient's state of mind when taking it.

The other term, $E[(f(X) - \hat{f}(X))^2]$, can be reduced -- for instance by using a model that suits the data better. In practice we only have the data $x$, $y$; $f$ is unknown, and hence so is $\epsilon$.

In fact, $E[(f(X) - \hat{f}(X))^2]$ can be split into the (squared) bias of $\hat{f}$ and the variance of $\hat{f}$: $\mathrm{Bias}(\hat{f})^2 + \mathrm{Var}(\hat{f}) = E[(f - E[\hat{f}])^2] + E[(\hat{f} - E[\hat{f}])^2]$, where $f=f(X)$ and $\hat{f} = \hat{f}(X)$.

Bias is the distance between the expectation of the estimate $\hat{f}$ and the true $f$.

Variance is the stability of $\hat{f}$ across different training data.

\begin{center}
\includegraphics[width=0.99\textwidth]{ISLR_Fig2_9.png}
\end{center}

Exercise: split $E[(f(X) - \hat{f}(X))^2]$ into the squared bias of $\hat{f}$ plus its variance. Hint: $E[(f-\hat{f})^2] = E[(f - E[\hat{f}] + E[\hat{f}] - \hat{f})^2]$. A worked sketch follows.
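
One way to complete the exercise (a sketch; here $f = f(X)$ is treated as fixed, and the randomness in $\hat{f}$ comes from the training sample):

\begin{align*}
E[(f-\hat{f})^2] &= E[(f - E[\hat{f}] + E[\hat{f}] - \hat{f})^2] \\
&= (f - E[\hat{f}])^2 + 2\,(f - E[\hat{f}])\,E[E[\hat{f}] - \hat{f}] \\
&\quad + E[(E[\hat{f}] - \hat{f})^2] \\
&= \mathrm{Bias}(\hat{f})^2 + \mathrm{Var}(\hat{f}),
\end{align*}
where the cross term vanishes because $E[E[\hat{f}] - \hat{f}] = E[\hat{f}] - E[\hat{f}] = 0$.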

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Advertising channels and sales}

A product is sold with advertising on TV, radio, and in newspapers. We collect sales and advertising-spend data for 200 markets. Which channel works best, i.e., is most strongly associated with sales? Given an advertising plan, can we predict sales?

The input variables $X=(X_1, X_2, X_3)$ are the three advertising budgets; the output variable $Y$ is sales.

$X$ goes by many names: variable, input variable, predictor, independent variable, feature.

Names for $Y$: response variable, dependent variable.

\begin{center}
\includegraphics[width=0.99\textwidth]{ISLR_Fig2_1.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Income versus years of education}

Simulated data.

\begin{center}
\includegraphics[width=0.58\textwidth]{ISLR_Fig2_2a.png}
\end{center}

\begin{center}
\includegraphics[width=0.55\textwidth]{ISLR_Fig2_2b.png}
\end{center}

In fact, income is also related to seniority.

\begin{center}
\includegraphics[width=0.80\textwidth]{ISLR_Fig2_3.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Inference}

Sometimes we care less about prediction than about the relationship between the input variables $X_1, X_2, ..., X_p$ and the output variable $Y$ -- written, say, as a mathematical equation.

\begin{itemize}
\item Some $X_i$ are related to $Y$ and some are not; how do we tell them apart?
\item For those that are related, what is the relationship -- positive or negative?
\item More subtly, the relationship between $Y$ and $X_i$ may also depend on $X_j$, $i \ne j$.
\item Is the relationship between $X_1, X_2, ..., X_p$ and $Y$ linear or nonlinear?
\end{itemize}

For instance, with the advertising and sales data we can ask: (1) Which advertising channels help? (2) Which helps most? (3) If TV spending rises by $\Delta$ units, how many extra units of sales can we expect?

Other examples: house prices, how well a product sells.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Estimating $f$ by $\hat{f}$}

Assume the function behind the data is $f$, so that $y_i = f(x_i) + \epsilon_i$; $f$ is usually unknown.

The data used to train the model are called the training data: $\{(x_1, y_1), ..., (x_n, y_n)\}$.

The $i$th observation is $x_i = (x_{i1}, x_{i2}, ..., x_{ip})^T$.

The data used to test the trained model $\hat{f}$ are called the test data.

Goal: train $\hat{f}$ on the training data with a statistical learning method so that it is as close as possible to the true $f$. How do we measure closeness? With the test data.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Parametric and non-parametric methods}

Statistical learning methods divide into parametric and non-parametric ones.

Parametric methods: (1) assume a model, e.g., $\hat{f}(X) = \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \cdots + \beta_p X_p$; (2) plug in the training data and compute the parameters $\beta_0, \cdots, \beta_p$ -- for example by least squares (linear regression). How are the values of $\beta_0, ..., \beta_p$ found? By minimizing $\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^n (y_i - \hat{f}(x_i))^2$.

$\mathrm{income} = \beta_0 + \beta_1 \times \mathrm{education} + \beta_2  \times \mathrm{seniority}$.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig2_4.png}
\end{center}
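
A minimal R sketch of the parametric recipe on simulated data (all variable names here are illustrative, not from the book):

{\small
\begin{verbatim}
# Step (1): assume a linear model; step (2): estimate the betas by
# least squares, which minimizes the training MSE (equivalently the RSS).
set.seed(1)
n  <- 100
x1 <- rnorm(n); x2 <- rnorm(n)
y  <- 1 + 2*x1 - 3*x2 + rnorm(n)  # simulated data; the true f is linear
fit <- lm(y ~ x1 + x2)            # least squares coefficient estimates
coef(fit)                         # close to the true (1, 2, -3)
mean(residuals(fit)^2)            # the minimized training MSE
\end{verbatim}
}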

Non-parametric methods assume no model in advance; they simply make $\hat{f}$ follow the training data as closely as possible subject to some smoothness. The advantage is that no (possibly unreasonable) assumptions are made; the disadvantage is that more training data are needed, and a smoothness level must be set during training.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig2_6.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Model flexibility versus interpretability}

Compared with nonlinear methods, linear methods produce a more interpretable $\hat{f}$. The lasso sets many coefficients to 0.

If the goal is inference, use a model with good interpretability.

If the goal is prediction, a more flexible model is usually better -- yet sometimes a simple model wins, because an overly flexible model {\bf overfits}.

Overfitting: small training error, large test error.

\begin{center}
\includegraphics[width=0.95\textwidth]{interpretability_vs_flexibility.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Supervised and unsupervised learning}

Supervised learning: the training data contain a response variable $y_i$, and we model the relationship between the predictor variables and the response, usually for prediction. Examples: least squares, logistic regression, SVMs, boosting.

\begin{center}
\includegraphics[width=0.65\textwidth]{ISLR_Fig2_2b.png}
\end{center}


Unsupervised learning: there is no response variable $y_i$ to ``supervise'' the learning. Clustering: do the observations form clear clusters, and which cluster does each belong to?

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig2_8.png}
\end{center}

With two variables we can look at a scatterplot -- but what if there are $p$ variables?

Semi-supervised learning: some data points carry a response (label) and some do not, because labels are costly to obtain.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Regression and classification problems}

Response variables come in two kinds: quantitative (e.g., age, height, income) and qualitative (also called categorical), such as sex, brand, or cancer type.

Quantitative responses generally correspond to regression problems, e.g., least squares.

Qualitative responses generally correspond to classification problems, e.g., logistic regression.

Whether the predictors are quantitative or qualitative matters little.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Measuring model accuracy}

No free lunch theorem: no single method is best on every data set.

Quality of fit: $\mathrm{MSE} = \frac{1}{n} \sum (y_i - \hat{f}(x_i))^2$, which can be evaluated on the training data or on test (unseen) data. Note that $\hat{f}$ is always obtained from the training data; what we really care about is its performance on test data $(x_0, y_0)$: is $\hat{f}(x_0)$ close to $y_0$? Here $(x_0, y_0)$ was never seen while training $\hat{f}$.

Only $\mathrm{test\,MSE} = \mathrm{Ave}[(y_0 - \hat{f}(x_0))^2]$ shows whether $\hat{f}$ is truly good; the $\mathrm{training\,MSE}$ does not.

The $\mathrm{training\,MSE}$ can be 0 while the $\mathrm{test\,MSE}$ is huge: that is overfitting.
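
A small simulated illustration (a sketch, not from the book): the flexible fit wins on the training half but loses badly on the held-out half.

{\small
\begin{verbatim}
set.seed(2)
n <- 50
x <- runif(n); y <- sin(2*pi*x) + rnorm(n, sd = 0.3)
train <- sample(n, 25)
fit.lin  <- lm(y ~ x, subset = train)            # rigid model
fit.poly <- lm(y ~ poly(x, 12), subset = train)  # very flexible model
mse <- function(fit, idx)
  mean((y[idx] - predict(fit, data.frame(x = x[idx])))^2)
c(train = mse(fit.lin,  train), test = mse(fit.lin,  -train))
c(train = mse(fit.poly, train), test = mse(fit.poly, -train))  # tiny train MSE, large test MSE
\end{verbatim}
}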

\begin{center}
\includegraphics[width=0.99\textwidth]{ISLR_Fig2_9.png}
\end{center}

Be able to explain the meaning and trend of every curve and every point. The test MSE is U-shaped. The training error is almost always smaller than the test error, because statistical learning methods are designed to minimize the training error, directly or indirectly.

Overfitting: good performance on the training data, poor performance on the test data. Cause: instead of capturing the true trend, the model fits randomly generated patterns. A somewhat simpler model is less prone to overfitting.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The U-shaped test error curve}

As model flexibility (complexity) increases, the test error can fall rather than rise. Why?

Statistical learning has two competing properties: bias and variance. More flexibility usually means larger variance and smaller bias; what matters is the relative size of the two changes.

The key formula for the expected test MSE:

\begin{center}
\includegraphics[width=0.99\textwidth]{Equation_2_7.png}
\end{center}

The equation says that the expected MSE of $\hat{f}$ at a test point $(x_0, y_0)$ has three components. Train many $\hat{f}$'s on many sets of training data; then $\hat{f}(x_0)$ is a random variable.

$\mathrm{Var}(\epsilon)$ is a lower bound on the expected test MSE.

Variance is how much $\hat{f}$ changes across different training data sets.

Bias is the distance between the expectation of $\hat{f}$ and the true function $f$.

The bias-variance trade-off: it is easy to obtain an $\hat{f}$ whose bias is small but whose variance is large.

\begin{center}
\includegraphics[width=0.5\textwidth]{ISLR_Fig2_12.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The classification setting}

$y_i$ is a class label.

Training error rate: $\frac{1}{n}\sum I(y_i \ne \hat{y}_i)$; test error rate: $\mathrm{Ave}(I(y_0 \ne \hat{y}_0))$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The Bayes classifier}

Assign $x_0$ to the class $j$ with the largest $Pr(Y = j | X = x_0)$; this minimizes $\mathrm{Ave}(I(y_0 \ne \hat{y}_0))$. The resulting error is called the Bayes error.

A. Zisserman
\begin{center}
  \includegraphics[width=0.92\textwidth]{bayes_decision_rule.png}
\end{center}

The case where $x$ is two-dimensional:

\begin{center}
  \includegraphics[width=0.85\textwidth]{ISLR_Fig2_13.png}
\end{center}


{\small
\begin{align*}
  p(error)   &= \int_{R1} Pr(Y=1|X=x)Pr(X=x) dx \\
  & + \int_{R2} Pr(Y=0|X=x) Pr(X=x) dx \\
  &= \int_{Pr(Y=0|X=x) \ge Pr(Y=1|X=x)} Pr(Y=1, X=x) dx  \\
  & + \int_{Pr(Y=1|X=x) \ge Pr(Y=0|X=x)} Pr(Y=0, X=x) dx \\
  &= 1 - E_{x}[\mathrm{max}_j \underline{Pr(Y=j|X=x)}], 
\end{align*}
}%

where $E_{x}[\mathrm{max}_j Pr(Y=j|X=x)]$ is the probability of correct classification.
The Bayes decision boundary is the set of points $x_0$ at which the largest posterior probabilities tie (for two classes, where $Pr(Y=1|X=x_0) = Pr(Y=0|X=x_0) = 0.5$).

The Bayes error is a lower bound on the test error of any classifier.

In practice the Bayes error is unknowable, because we do not know the conditional probabilities.

It is the analogue of the irreducible error in regression.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The KNN classifier}

Given $x_0$, find its $K$ nearest points, look at their classes, and estimate the probability of each class.

Then assign $x_0$ the class with the largest probability.

$Pr(Y=j | X=x_0) = \frac{1}{K} \sum_{i \in N_0} I(y_i = j)$.
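
A sketch of this rule with knn() from the class package (toy data; the names are illustrative):

{\small
\begin{verbatim}
library(class)                      # provides knn()
set.seed(3)
train.X <- rbind(matrix(rnorm(100, mean = 0),   50, 2),
                 matrix(rnorm(100, mean = 1.5), 50, 2))
train.y <- factor(rep(c("A", "B"), each = 50))
x0 <- matrix(c(0.75, 0.75), 1, 2)   # a test point
# majority vote among the K = 5 nearest neighbours;
# prob = TRUE also returns the winning class proportion
knn(train.X, x0, train.y, k = 5, prob = TRUE)
\end{verbatim}
}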

\begin{center}
  \includegraphics[width=0.95\textwidth]{ISLR_Fig2_14.png}
\end{center}

\begin{center}
  \includegraphics[width=0.80\textwidth]{ISLR_Fig2_15.png}
\end{center}

\begin{center}
  \includegraphics[width=0.95\textwidth]{ISLR_Fig2_16.png}
\end{center}

\begin{center}
  \includegraphics[width=0.85\textwidth]{ISLR_Fig2_17.png}
\end{center}

Choosing the right amount of flexibility matters; it is again the bias-variance trade-off in the test error.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Simple linear regression}

$Y=\beta_0 + \beta_1 X_1 + \epsilon$.

With a single predictor $X_1$, \\
\underline{estimate the parameters $\beta_0$, $\beta_1$, obtaining $\hat{\beta_0}$, $\hat{\beta_1}$}.

\begin{center}
  \includegraphics[width=0.90\textwidth]{ISLR_Fig3_1.png}
\end{center}

\begin{center}
  \includegraphics[width=0.70\textwidth]{Advertising_top16.png}
\end{center}


Take the RSS (Residual Sum of Squares) as the objective function.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_3.png}
\end{center}

The RSS attains its minimum when its partial derivatives with respect to $\hat{\beta_0}$ and $\hat{\beta_1}$ are both 0.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_4.png}
\end{center}

\newpage
The parameter estimates obtained this way are called the least squares coefficient estimates.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Fig3_2.png}
\end{center}

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Table3_1.png}
\end{center}
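
In R the whole procedure is one call to lm(); a sketch, assuming Advertising.csv has been downloaded from the book's website:

{\small
\begin{verbatim}
Advertising <- read.csv("Advertising.csv")  # from the book's website
fit <- lm(sales ~ TV, data = Advertising)
summary(fit)  # intercept about 7.03, slope about 0.0475, as in Table 3.1
\end{verbatim}
}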

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Deriving $\hat{\beta_0}$ and $\hat{\beta_1}$}

\begin{center}
  \includegraphics[width=0.50\textwidth]{IMG_6988.png}
\end{center}

\begin{center}
  \includegraphics[width=0.50\textwidth]{IMG_6989.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Two fitted lines to distinguish}

Population regression line: $Y=\beta_0 + \beta_1 X + \epsilon$.

Least squares line: $\hat{Y}=\hat{\beta}_0 + \hat{\beta}_1 X$.

Without the $\hat{}$ they are parameters; with the $\hat{}$ they are parameter estimates.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Accuracy of the regression coefficient estimates}


\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Fig3_3.png}
\end{center}

Parameter estimates are themselves random variables: different training sets yield different estimates. The standard error of an estimate is $\mathrm{SE}(\hat{\beta})$.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_8.png}
\end{center}

The formulas above hold when the $\epsilon_i$ are mutually uncorrelated and each $\epsilon_i$ has the same variance $\sigma^2$.

$\sigma$ is usually unknown; it can be replaced by the Residual Standard Error, $\mathrm{RSE}= \sqrt{\mathrm{RSS}/(n-2)}$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Confidence intervals}

The 95\% confidence interval is $[\hat{\beta_1} - 2 \cdot \mathrm{SE}(\hat{\beta_1}), \;\; \hat{\beta_1} + 2 \cdot \mathrm{SE}(\hat{\beta_1})]$: the probability that the true $\beta_1$ falls \underline{outside} this interval is only 5\%.

\begin{center}
  \includegraphics[width=0.62\textwidth]{DeriveConfidenceInterval.png}
\end{center}
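
In R, confint() gives the exact t-based interval; the hand rule above uses $2\,\mathrm{SE}$. A sketch, again assuming a local copy of Advertising.csv:

{\small
\begin{verbatim}
Advertising <- read.csv("Advertising.csv")
fit <- lm(sales ~ TV, data = Advertising)
confint(fit, level = 0.95)              # exact t-based intervals
se <- summary(fit)$coefficients["TV", "Std. Error"]
coef(fit)["TV"] + c(-2, 2) * se         # the 2*SE approximation
\end{verbatim}
}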


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Hypothesis testing}

$H_0$: there is \underline{no} relationship between $X$ and $Y$. $H_a$: there is a relationship.

$H_0: \beta_1 = 0$. $H_a: \beta_1 \ne 0$.

t-statistic: $\displaystyle t = \frac{\hat{\beta_1} - 0}{\mathrm{SE}(\hat{\beta_1})}$.

p-value: the probability, when $H_0$ is true, of observing a value at least as large as $t$. A small p-value means $H_0$ is unlikely to be true, i.e., we reject $H_0$.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Table3_1.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Assessing model fit}

RSE measures lack of fit; its drawback is that it carries the scale of $Y$.

$R^2$ statistic. $\displaystyle R^2 = \frac{\mathrm{TSS}-\mathrm{RSS}}{\mathrm{TSS}} = \frac{\sum (y_i - \bar{y})^2 - \sum (y_i - \hat{y_i})^2}{\sum (y_i - \bar{y})^2} = \frac{\sum (\hat{y_i} - \bar{y})^2}{\sum (y_i - \bar{y})^2}$. It takes values between 0 and 1 and is independent of the scale of $Y$.

\begin{center}
  \includegraphics[width=0.99\textwidth]{least_squares_geometric_interpretation.png}
\end{center}

Does $\mathrm{TSS} = \mathrm{RSS} + \mathrm{R'SS}$, i.e., $\sum (y_i - \bar{y})^2 = \sum (y_i - \hat{y_i})^2 + \sum (\hat{y_i} - \bar{y})^2$?

$\sum (y_i - \bar{y})^2 = \sum y_i^2 - 2\bar{y}\sum y_i + n\bar{y}^2$

$\sum (\hat{y_i} - \bar{y})^2 = \sum \hat{y_i}^2 - 2\bar{y}\sum \hat{y_i} + n\bar{y}^2$
{\small
\begin{align*}
  \sum (y_i - \bar{y})^2  - \sum (\hat{y_i} - \bar{y})^2 &= \sum y_i^2 - \sum \hat{y_i}^2 - 2\bar{y} \sum (y_i - \hat{y_i}) \\
  &= \sum y_i^2 - \sum \hat{y_i}^2 \\
  &= \sum \epsilon_i^2 \\
  &= \sum (y_i - \hat{y_i})^2
\end{align*}
}%
because $||y||^2 - ||\hat{y}||^2 = ||\epsilon||^2$ [geometric interpretation: $\hat{y} \perp \epsilon$] and $\sum (y_i - \hat{y_i}) = 0$ [set the partial derivative of the RSS with respect to $\beta_0$ to 0].

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{$R^2$ illustrated}

{\small https://onlinecourses.science.psu.edu/stat501/node/255}

\begin{center}
  \includegraphics[width=0.99\textwidth]{situation_1_plot.png}
\end{center}

\begin{center}
  \includegraphics[width=0.99\textwidth]{situation_2_plot.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{$R^2$ versus $r^2$}

$r=Cor(X,Y)$ is (the estimate of) the correlation coefficient between $X$ and $Y$.

\begin{center}
  \includegraphics[width=0.99\textwidth]{Equation_3_18.png}
\end{center}

With a single predictor $X$, $R^2 = r^2$.

Note that $\displaystyle R^2 = \frac{\sum (\hat{y_i} - \bar{y})^2}{\sum (y_i - \bar{y})^2}$ with $\hat{y_i} = \hat{\beta_0} + \hat{\beta_1} x_i$. Substituting the values of $\hat{\beta_0}$ and $\hat{\beta_1}$ and simplifying yields Equation 3.18.

{\tiny
\begin{align*}
  \sum (\hat{y_i} - \bar{y})^2 &= \sum (\hat{\beta_0} + \hat{\beta_1} x_i - \bar{y})^2 \\
  &= \sum (\bar{y} - \hat{\beta_1} \bar{x} + \hat{\beta_1} x_i - \bar{y})^2 \\
  &= \hat{\beta_1}^2 \sum (x_i - \bar{x})^2
\end{align*}
}%

\begin{center}
  \includegraphics[width=0.60\textwidth]{ISLR_Eq3_4.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{$\hat{\beta_1}$ and $r$ in simple linear regression}

$\displaystyle \hat{\beta_1} \times \frac{\sqrt{\sum (x_i - \bar{x})^2}}{\sqrt{ \sum (y_i - \bar{y})^2}} = \hat{\beta_1} \times \frac{\mathrm{SD}(X)}{\mathrm{SD}(Y)} = r$.

So after standardizing $X$ and $Y$, $r$ and $\hat{\beta_1}$ take the same value.
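
A quick numerical check of this identity on simulated data (a sketch):

{\small
\begin{verbatim}
set.seed(4)
x <- rnorm(50); y <- 2*x + rnorm(50)
r      <- cor(x, y)
b1.std <- coef(lm(scale(y) ~ scale(x)))[2]  # slope after standardizing
c(r = r, slope = unname(b1.std))            # the two values agree
\end{verbatim}
}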



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Example}


{\tiny https://pubs.rsna.org/doi/pdf/10.1148/radiol.2273011499}

\begin{center}
  \includegraphics[width=0.95\textwidth]{radiol_2273011499_figure1.png}
\end{center}


Significance test: $\displaystyle t=r \sqrt{\frac{n-2}{1-r^2}}$.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Linear regression with several predictors}

More than one predictor variable.

\begin{center}
  \includegraphics[width=0.70\textwidth]{ISLR_Fig3_4.png}
\end{center}

\begin{center}
  \includegraphics[width=0.90\textwidth]{ESLII_print10_page12.png}
\end{center}

$\hat{\beta} = (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y}$.

$\mathbf{\hat{y}} = \mathbf{X} \hat{\beta} = \mathbf{X} (\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{y} = \mathbf{H} \mathbf{y}$.
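
A sketch verifying the closed form against lm() on simulated data:

{\small
\begin{verbatim}
set.seed(5)
n <- 100; p <- 3
X <- cbind(1, matrix(rnorm(n * p), n, p))  # design matrix, intercept column first
beta <- c(1, 2, -1, 0.5)
y <- X %*% beta + rnorm(n)
beta.hat <- solve(t(X) %*% X, t(X) %*% y)  # (X^T X)^{-1} X^T y
cbind(normal.eq = drop(beta.hat), lm = coef(lm(y ~ X[, -1])))
\end{verbatim}
}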

\begin{center}
  \includegraphics[width=0.98\textwidth]{ISLR_Table3_4.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Multiple versus simple regression results}

In the multiple regression, the coefficient on newspaper is nearly zero ($-0.001$).

In a simple regression of sales on newspaper alone, \\
the newspaper coefficient (0.055) is nonzero.

Reason: radio and newspaper are correlated (0.35).

Regress shark attacks $Y$ on ice cream sales $X$ and the coefficient comes out positive.

Does that mean ice cream should be banned on the beach?

No. The real driver is temperature: hot weather sends more people to the beach, sells more ice cream, and also produces more shark attacks.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Important questions in multiple regression}

\begin{itemize}
\item Do any of $X_1, ..., X_p$ help predict $Y$?

$H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0$

$H_a: \exists \;\; \beta_j \ne 0$

$\mathrm{RSS_{restricted \, model}}$: 1 parameter (only the intercept $\beta_0$), $n-1$ df

$\mathrm{RSS_{unrestricted \, model}}$: $p+1$ parameters, $n-p-1$ df

$F = \displaystyle \frac{(\mathrm{RSS_{\mathrm{restricted \, model}}} - \mathrm{RSS_{\mathrm{unrestricted \, model}}})/p}{\mathrm{RSS_{\mathrm{unrestricted \, model}}}/(n-p-1)}$

Under $H_0$ the numerator is a scaled chi-square with $p$ degrees of freedom and the denominator a scaled chi-square with $n-p-1$, so $F$ follows an $F_{p, n-p-1}$ distribution.
  
\newpage
$\displaystyle F\mathrm{-statistic}: \;\; F = \frac{(\mathrm{TSS}-\mathrm{RSS})/p}{\mathrm{RSS}/(n-p-1)} = \frac{R^2/p}{(1-R^2)/(n-p-1)}$

If $H_0$ is true, $F$ is close to 1. In the sales versus TV, radio, newspaper example, $F=570$, so we reject $H_0$.

How large must $F$ be? That depends on the sample size $n$: when $n$ is large, even a modest $F$ is meaningful.

To test whether a particular set of $q$ coefficients is all zero, \\
use $\displaystyle F=\frac{(\mathrm{RSS}_0 - \mathrm{RSS})/q}{\mathrm{RSS}/(n-p-1)}$.

$q=1$ is a special case: the $F$ value then measures the partial effect of adding that single predictor back.

When $q=1$, $F \sim F_{1, n-p-1}$; and if $X \sim t(n-p-1)$, then $X^2 \sim F_{1, n-p-1}$.

If the number of predictors $p$ exceeds the sample size $n$, the $F$-statistic cannot be used.

\newpage
With many predictors ($p=100$), small p-values will appear in a multiple regression even when the predictors have no true association with the response.

{\tiny
\begin{verbatim}
# Purpose: "we expect to see approximately five small p-values even in the absence of
# any true association between the predictors and the response"
# Created by Hui Lan on 19 April 2018 (zjnu)

N <- 1000 # number of data points
p <- 100  # number of dimensions
X <- cbind(rep(1,N), matrix(rnorm(N*p), N, p))
beta = matrix(c(12, rep(0, p)), p+1, 1)
y = X %*% beta + rnorm(N,0,0.5) # H0 is true

model <- lm(y ~ X)
summary(model)




######### Output ###########

Coefficients: (1 not defined because of singularities)
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.0065970  0.0169376 708.872  < 2e-16 ***
X1                  NA         NA      NA       NA    
X2           0.0178357  0.0166013   1.074  0.28295    
X3           0.0265815  0.0164173   1.619  0.10577    
X4           0.0494653  0.0169262   2.922  0.00356 ** 
X5           0.0078647  0.0168164   0.468  0.64013    
X6           0.0277616  0.0174029   1.595  0.11101    
X7           0.0134196  0.0167375   0.802  0.42290    
X8           0.0270449  0.0171876   1.574  0.11595    
X9           0.0079501  0.0167443   0.475  0.63505    
X10         -0.0123964  0.0175507  -0.706  0.48017    
X11         -0.0144577  0.0165945  -0.871  0.38386    
X12          0.0235717  0.0164857   1.430  0.15311    
X13          0.0096259  0.0167328   0.575  0.56525    
X14         -0.0124179  0.0173346  -0.716  0.47395    
X15          0.0054849  0.0169616   0.323  0.74649    
X16          0.0133782  0.0170321   0.785  0.43239    
X17          0.0298977  0.0165359   1.808  0.07093 .  
X18          0.0171318  0.0174862   0.980  0.32748    
X19          0.0141900  0.0170608   0.832  0.40578    
X20          0.0114698  0.0170003   0.675  0.50005    
X21         -0.0146711  0.0169974  -0.863  0.38829    
X22          0.0166242  0.0170126   0.977  0.32875    
X23          0.0064860  0.0170476   0.380  0.70369    
X24          0.0058865  0.0169134   0.348  0.72789    
X25          0.0067248  0.0166322   0.404  0.68607    
X26          0.0201784  0.0167482   1.205  0.22859    
X27          0.0095910  0.0173812   0.552  0.58122    
X28         -0.0026596  0.0168463  -0.158  0.87459    
X29         -0.0259208  0.0167510  -1.547  0.12211    
X30          0.0075856  0.0173363   0.438  0.66181    
X31          0.0048453  0.0171380   0.283  0.77745    
X32         -0.0122234  0.0167718  -0.729  0.46631    
X33          0.0052291  0.0164930   0.317  0.75128    
X34          0.0132872  0.0173065   0.768  0.44283    
X35          0.0017958  0.0173947   0.103  0.91780    
X36          0.0044950  0.0168204   0.267  0.78935    
X37         -0.0022061  0.0171846  -0.128  0.89788    
X38          0.0128208  0.0168993   0.759  0.44825    
X39          0.0122171  0.0173370   0.705  0.48119    
X40         -0.0056949  0.0168950  -0.337  0.73614    
X41          0.0129821  0.0170021   0.764  0.44533    
X42         -0.0037633  0.0171881  -0.219  0.82674    
X43         -0.0344980  0.0178071  -1.937  0.05302 .  
X44          0.0149632  0.0167351   0.894  0.37150    
X45         -0.0084597  0.0173323  -0.488  0.62560    
X46          0.0246508  0.0166288   1.482  0.13858    
X47         -0.0092400  0.0162326  -0.569  0.56934    
X48         -0.0128468  0.0176364  -0.728  0.46654    
X49          0.0310793  0.0163969   1.895  0.05835 .  
X50          0.0122241  0.0168520   0.725  0.46841    
X51         -0.0017788  0.0168488  -0.106  0.91595    
X52          0.0095988  0.0171531   0.560  0.57589    
X53         -0.0129008  0.0169640  -0.760  0.44717    
X54          0.0033494  0.0168667   0.199  0.84263    
X55         -0.0019071  0.0170247  -0.112  0.91083    
X56         -0.0179975  0.0172345  -1.044  0.29664    
X57         -0.0014811  0.0175148  -0.085  0.93263    
X58         -0.0129828  0.0163702  -0.793  0.42795    
X59          0.0186103  0.0174296   1.068  0.28593    
X60         -0.0099381  0.0174525  -0.569  0.56920    
X61         -0.0056888  0.0170587  -0.333  0.73885    
X62         -0.0151715  0.0172610  -0.879  0.37967    
X63          0.0072265  0.0168380   0.429  0.66789    
X64         -0.0246899  0.0170229  -1.450  0.14730    
X65         -0.0052167  0.0169090  -0.309  0.75776    
X66          0.0038626  0.0172589   0.224  0.82296    
X67          0.0263086  0.0170381   1.544  0.12291    
X68          0.0014927  0.0172881   0.086  0.93122    
X69          0.0125356  0.0170363   0.736  0.46203    
X70          0.0195364  0.0173074   1.129  0.25929    
X71          0.0435482  0.0173629   2.508  0.01231 *  
X72         -0.0377053  0.0173973  -2.167  0.03047 *  
X73         -0.0016825  0.0166377  -0.101  0.91947    
X74         -0.0244445  0.0170528  -1.433  0.15207    
X75          0.0017370  0.0171624   0.101  0.91941    
X76         -0.0156463  0.0168771  -0.927  0.35414    
X77          0.0111840  0.0169061   0.662  0.50844    
X78         -0.0134714  0.0173918  -0.775  0.43879    
X79          0.0008609  0.0170256   0.051  0.95968    
X80          0.0022467  0.0174597   0.129  0.89764    
X81         -0.0040672  0.0165622  -0.246  0.80607    
X82          0.0361365  0.0172968   2.089  0.03697 *  
X83          0.0061030  0.0168264   0.363  0.71691    
X84         -0.0104370  0.0167507  -0.623  0.53339    
X85         -0.0101807  0.0165662  -0.615  0.53901    
X86         -0.0121411  0.0168796  -0.719  0.47216    
X87         -0.0078170  0.0175655  -0.445  0.65642    
X88         -0.0181524  0.0166127  -1.093  0.27483    
X89          0.0322584  0.0171121   1.885  0.05974 .  
X90          0.0214747  0.0171809   1.250  0.21166    
X91         -0.0118408  0.0165181  -0.717  0.47366    
X92         -0.0150087  0.0175115  -0.857  0.39163    
X93          0.0156271  0.0168891   0.925  0.35507    
X94          0.0181387  0.0171322   1.059  0.29000    
X95          0.0525594  0.0165603   3.174  0.00156 ** 
X96         -0.0241572  0.0170478  -1.417  0.15682    
X97          0.0001008  0.0173618   0.006  0.99537    
X98          0.0224724  0.0173428   1.296  0.19538    
X99          0.0229057  0.0174303   1.314  0.18914    
X100        -0.0155684  0.0171128  -0.910  0.36320    
X101        -0.0107134  0.0170035  -0.630  0.52881    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5108 on 899 degrees of freedom
Multiple R-squared:  0.1014,	Adjusted R-squared:  0.001392 
F-statistic: 1.014 on 100 and 899 DF,  p-value: 0.4472

\end{verbatim}
}

\item Are all the predictors useful, or only some of them?

\underline{Forward selection}: start from the intercept-only model; fit a simple regression for each candidate predictor and add the one that reduces the RSS most, repeating as needed.
\underline{Backward selection}: repeatedly remove the variable with the largest p-value.
\underline{Mixed selection}: add variables as in forward selection, removing any whose p-value rises above a threshold. (A sketch with R's step() follows this list.)
  
\item How well does the model fit the data?

  $R^2$: the \underline{proportion} of the variance of Y that is explained; takes values between 0 and 1. \\

  $\displaystyle \mathrm{RSE} = \sqrt{\frac{\mathrm{RSS}}{n-p-1}}$: measures lack of fit, in the units (scale) of Y.

\end{itemize}
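
R's step() automates a similar search, though it ranks models by AIC rather than by the raw RSS or p-value rules described above. A sketch, assuming a local copy of Advertising.csv:

{\small
\begin{verbatim}
Advertising <- read.csv("Advertising.csv")
null.model <- lm(sales ~ 1, data = Advertising)
full.model <- lm(sales ~ TV + radio + newspaper, data = Advertising)
step(null.model, scope = formula(full.model),
     direction = "forward")               # add variables one at a time
step(full.model, direction = "backward")  # drop variables one at a time
\end{verbatim}
}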

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Qualitative predictors}

Factor, categorical variable.

Sex, student or not, married or not. They can be coded 1/0, 0/1, or $-1$/1.

A factor with $a$ possible values is represented by $a-1$ dummy variables.

The relationship between credit card balance and gender.

$\mathrm{balance} = \beta_0 + \beta_1 \times \mathrm{gender} + \epsilon$

If the card holder is female, $x_i=1$; if male, $x_i=0$. \\
$\hat{\beta_0} = 509.80$, $\hat{\beta_1} = 19.73$.


\begin{center}
  \includegraphics[width=0.99\textwidth]{CreditCardMaleFemaleBinaryCoding.png}
\end{center}


\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Table3_7.png}
\end{center}

If the card holder is female, $x_i=0$; if male, $x_i=1$. \\
$\hat{\beta_0} = 529.53$, $\hat{\beta_1} = -19.73$.



The two codings give different dummy values, and hence different $\hat{\beta}$'s, but the substance is the same: \\
under both, the mean balance for male holders is 509.80. \\
Just substitute the $x_i$ values into $\hat{y_i} = \hat{\beta_0} + \hat{\beta_1} x_i$.

\newpage
{\tiny
\begin{verbatim}
# Purpose: to illustrate the coefficients of a binary variable in regression
# Created by Hui Lan on 20 April 2018 @ zjnu, Jinhua
library(MASS)
library(ISLR)

CD <- Credit # credit card data
index.male <- which(CD$Gender == ' Male')
index.female <- which(CD$Gender == 'Female')
CD$Gender2 <- rep(NA, length(CD$Gender))
CD$Gender2[index.male]   <- 0
CD$Gender2[index.female] <- 1

model <- lm(Balance ~ Gender2, data=CD) 
# In fact, model <- lm(Balance ~ Gender, data=Credit) will just work as well, 
# treating Female as 1 and Male as 0 by default.
summary(model)
plot(CD$Gender2, CD$Balance, pch='o', xlab='Binary Gender (M:0, F:1)', ylab='Credit Card Balance')  # pch - point character
abline(model, lwd=1, col='blue') # lwd - line width,  col - color, not column
abline(h=509.80, col='red')

mean(CD$Balance[index.male])
mean(CD$Balance[index.female])
\end{verbatim}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Interaction terms}

Additivity: the effect of $X_j$ on $Y$ does not depend on the other variables. One extra unit of TV spending always raises sales by $\beta_1$, whatever the radio spending.

Linearity: each one-unit increase in $X_j$ changes $Y$ by the same fixed amount, regardless of the current value of $X_j$.

\begin{align*}
  Y &= \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \underline{\beta_3 X_1 X_2} + \epsilon \\
  &= \beta_0 + \underline{(\beta_1 + \beta_3 X_2)} X_1 + \beta_2 X_2 + \epsilon \\
  &= \beta_0 + \beta_1 X_1 + \underline{(\beta_2 + \beta_3 X_1)} X_2 + \epsilon
\end{align*}

The coefficient in front of $X_1$ depends on the value of $X_2$, and symmetrically for $X_2$.

Synergistic effect: spending $X_1=5000$ together with $X_2=5000$ produces a bigger effect than $X_1=10000$ alone or $X_2=10000$ alone.
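
Fitting the interaction model in R (a sketch, assuming the local Advertising.csv again); in a formula, TV*radio expands to TV + radio + TV:radio:

{\small
\begin{verbatim}
Advertising <- read.csv("Advertising.csv")
fit <- lm(sales ~ TV * radio, data = Advertising)
summary(fit)  # the TV:radio row is the interaction coefficient beta_3
\end{verbatim}
}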

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Table3_9.png}
\end{center}

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_33.png}
\end{center}

The relationship between student status (student), income, and credit card balance.

\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_34.png}
\end{center}
\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Eq3_35.png}
\end{center}
\begin{center}
  \includegraphics[width=0.99\textwidth]{ISLR_Fig3_7.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Multicollinearity and the VIF}

Multicollinearity occurs when the predictors are correlated.

It harms the accuracy of the coefficient estimates: the SE of an estimated $\beta$ is inflated,
and a small change in the data can change the estimated $\beta$'s a great deal.

{\small Robert A. Stine, Graphical Interpretation of Variance Inflation Factors, 1995.}

$\displaystyle \mathrm{var}(\hat{\beta_j}) = \frac{\sigma^2}{\sum (x_{ij} - \bar{x}_j )^2 } \mathrm{VIF}_j $.


$\displaystyle \mathrm{VIF}_j = \frac{1}{1 - R^2_{X_j|X_{-j}}}$. The closer the VIF is to 1, the better; values above 5 are troublesome.

With VIF=5, the $\mathrm{SE}^2$ of the corresponding $\beta$ is inflated fivefold relative to VIF=1 (no correlation with the other variables).

Example: $Y = 10 + 0.2 X_1 + 4 X_2 + \epsilon$.

$cor(X_1, X_2) \approx 0$: perturbing $x_2$ slightly barely changes the estimated $\beta$'s.
{\small
\begin{verbatim}
> coef(model.o)
(Intercept) x1.original x2.original 
 10.0266609   0.1986754   3.9889862 

> coef(model.p)
(Intercept) x1.original  x2.perturb 
 10.0144450   0.1951758   3.9922136 

> cor(x1.original, x2.original)
[1] 0.03182626
> cor(x1.original, x2.perturb)
[1] 0.03278774

> vif(model.o)
x1.original x2.original 
   1.001014    1.001014 
> vif(model.p)
x1.original  x2.perturb 
   1.001076    1.001076 
\end{verbatim}
}

$cor(X_1, X_2) \approx 0.99$: perturbing $x_2$ slightly changes the estimated $\beta$'s a great deal,
though it has little effect on the model's predictions.
{\small
\begin{verbatim}
> cor(x1.original, x2.original)
[1] 0.9986939
> cor(x1.original, x2.perturb)
[1] 0.9974933

> coef(model.o)
(Intercept) x1.original x2.original 
 9.86885942  0.03890966  4.14690502 
> coef(model.p)
(Intercept) x1.original  x2.perturb 
  11.751708    1.903390    2.266739 

> vif(model.o)
x1.original x2.original 
   383.0822    383.0822 
> vif(model.p)
x1.original  x2.perturb 
   199.7143    199.7143 
\end{verbatim}
}

{\small See the script Multicollinearity\_test.R}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Outliers and high-leverage points}

Point 20 is an outlier: the fitted line barely moves when it is removed.

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_12.png}
\end{center}

Point 41 is a high-leverage point: the line changes a lot when it is removed.
\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_13.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq3_37.png}
\end{center}

$h_i$ lies between $1/n$ and $1$.

Writing the residual as $e_i$, the studentized residual is \\
$\displaystyle \frac{e_i}{\hat{\sigma}\sqrt{1-h_i}}$.

Influential points: \\
Cook's Distance $D_i = \frac{\sum (\hat{y}_j - \hat{y}_{j|-i})^2}{p \cdot s^2} = \frac{e_i^2}{p \cdot s^2} \cdot \frac{h_i}{(1-h_i)^2}$, where $\displaystyle s^2 = \frac{\sum (y_i - \hat{y_i})^2}{n-p}$.
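
R computes all three diagnostics directly; a sketch on simulated data with one point that is both outlying and high-leverage:

{\small
\begin{verbatim}
set.seed(6)
x <- c(rnorm(50), 8)              # the 51st x value is far from the rest
y <- 1 + 0.5 * x + rnorm(51)
y[51] <- 12                       # ...and its response is off the line
fit <- lm(y ~ x)
h <- hatvalues(fit)               # leverage h_i
r <- rstudent(fit)                # studentized residuals
d <- cooks.distance(fit)          # Cook's distance
which(h > 2 * mean(h))            # flag high-leverage points
which(abs(r) > 3)                 # flag outliers
which(d > 4 / length(y))          # flag influential points (rule of thumb)
\end{verbatim}
}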


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Polynomial regression}

Polynomial regression expresses a nonlinear relationship within a linear model.

Method: add new variables $X_2 = X^2, X_3 = X^3, X_4 = X^4, X_5 = X^5, ...$

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq3_36.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_8.png}
\end{center}


Raising the degree to 7.
\begin{center}
\includegraphics[width=0.95\textwidth]{polynomial_regression_degree8.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{polynomial_regression_degree8_B.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{polynomial_regression_degree8_C.png}
\end{center}

{\tiny
\begin{verbatim}
# Purpose: to demonstrate polynomial regression.
# Created by Hui Lan on 19 April 2018

f <- function(x)
{
    # the true function
    # y <- 0.5 + 1.5*x + 0.5*x^2  # also try quadratic function
    y <- 0.5 + 1.5*x  # y is implicitly returned. don't need a return statement
}

### main ###
sd.err <- 2
N <- 12
x1 <- rnorm(N, mean=5, sd=3)  # my data for the predictor variable X1
e = rnorm(N, mean=0, sd=sd.err) # error term
y  <- f(x1) + e   # my data for the response variable Y

model <- lm(y~x1) # call linear model fit. use ?lm to get details of this function
summary(model)

D <- data.frame(Y=y, X1=x1, X2=x1^2, X3=x1^3, X4=x1^4, 
                X5=x1^5, X6=x1^6,X7=x1^7)
model.polynomial <- lm(Y ~ X1+X2+X3+X4+X5+X6+X7, data=D)
plot(x1, y, pch='+')
abline(model, lwd=1, col='green')
curve(f, from=min(x1), to=max(x1), col='red', add=T)
num.interval <- 100
new.x <- seq(min(x1),max(x1),length=num.interval)
test.x <- data.frame(X1=new.x, X2=new.x^2, X3=new.x^3, X4=new.x^4, 
                     X5=new.x^5, X6=new.x^6, X7=new.x^7)
lines(new.x,predict(model.polynomial, newdata=test.x), col="blue")
\end{verbatim}
}

\newpage
12 points, a degree-11 polynomial.

\begin{center}
\includegraphics[width=0.98\textwidth]{polynomial_regression_degree11.png}
\end{center}

The code follows.

{\tiny
\begin{verbatim}
# polynomial_regression2.R
# Purpose: to demonstrate polynomial regression using poly()
#
#
# Created by Hui Lan on 26 April 2018


### Functions ###

f <- function(x)
{
    # the true function
    # y <- 0.5 + 1.5*x + 0.5*x^2  # also try quadratic function
    y <- 0.5 + 1.5*x  # y is implicitly returned. don't need a return statement
}



### main ###
sd.err <- 2
N <- 12
x1 <- rnorm(N, mean=5, sd=3)  # my data for the predictor variable X1
e = rnorm(N, mean=0, sd=sd.err) # error term
y  <- f(x1) + e   # my data for the response variable Y


model <- lm(y~x1) # call linear model fit. use ?lm to get details of this function
model
summary(model)

D <- data.frame(Y=y, X1=x1)
model.polynomial <- lm(Y ~ poly(X1,11), data=D)
par(mfrow=c(1,1))
plot(x1, y, pch='+')
abline(model, lwd=1, col='green')
curve(f, from=min(x1), to=max(x1), col='red', add=T)
num.interval <- 100
new.x <- seq(min(x1),max(x1),length=num.interval)
lines(new.x,predict(model.polynomial, data.frame(X1=new.x)), col="blue")
\end{verbatim}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The bias-variance trade-off}

Expected test MSE:

\begin{center}
\includegraphics[width=0.99\textwidth]{Equation_2_7.png}
\end{center}

The expected MSE of $\hat{f}$ at a test point $(x_0, y_0)$: train many $\hat{f}$'s on many training sets, so that $\hat{f}(x_0)$ is a random variable.

\newpage
An intuitive picture:

https://elitedatascience.com/bias-variance-tradeoff

\newpage

\begin{center}
\includegraphics[width=0.85\textwidth]{bias_variance_tradeoff_illustration_pdf.pdf}
\end{center}

{\small
\begin{verbatim}
# Linear Regression Model
> bias.squared.1
[1] 0.047218
> variance.1
[1] 0.1495648
# Polynomial Regression Model
> bias.squared.2
[1] 0.02411328
> variance.2
[1] 0.6207646
\end{verbatim}
}


{\tiny
\begin{verbatim}
# Purpose: to illustrate bias-variance trade-off
#
#
# Created by Hui Lan on 19 April 2018
# Last modified on 28 April 2018, variance.1 <- var(all.f.hat.1 - f.true) is changed to variance.1 <- var(all.f.hat.1).


### Functions ###

f <- function(x)
{
    # the true function
    y <- 0.5 + 1.5*x + 0.25*x^2  # also try quadratic function
    #y <- 0.5 + 1.5*x  # y is implicitly returned. don't need a return statement
}



### main ###
x0 <- 0.5 # my test data point
f.true <- f(x0)

num <- 9 # number of re-sampling
par(mfrow=c(3,3),mar=c(1,1,1,1))
make.plot <- T
bias.vector <- rep(0, num)
variance.vector <- rep(0, num)
all.f.hat.1 <- rep(0, num) # f hat from linear regressions
all.f.hat.2 <- rep(0, num) # f hat from polynomial regressions
total.points <- 500
x1.population <- rnorm(total.points, mean=0.5, sd=1)
sd.err <- 1.5
e <- rnorm(total.points, mean=0, sd=sd.err)
y1.population <- f(x1.population) + e
for (i in 1:num) {
    # Generate data from true f

    N <- 20
    index <- sample(1:total.points,N)
    x1 <- x1.population[index]  # my data for the the predictor variable X1
    y  <- y1.population[index]   # my data for the response variable Y


    D1 <- data.frame(Y=y, X1=x1)
    model <- lm(Y ~ X1, data=D1)
    f.hat.1 <- predict(model, data.frame(X1=x0))
    all.f.hat.1[i] <- f.hat.1


    D2 <- data.frame(Y=y, X1=x1, X2=x1^2, X3=x1^3, X4=x1^4,
                    X5=x1^5, X6=x1^6,X7=x1^7)
    model.polynomial <- lm(Y ~ X1+X2+X3+X4+X5+X6+X7, data=D2)
    num.interval <- 100
    new.x <- seq(min(x1),max(x1),length=num.interval)
    new.x.df <- data.frame(X1=new.x, X2=new.x^2, X3=new.x^3, X4=new.x^4, X5=new.x^5, X6=new.x^6, X7=new.x^7)
    test.x <- data.frame(X1=x0, X2=x0^2, X3=x0^3, X4=x0^4, X5=x0^5, X6=x0^6, X7=x0^7)
    f.hat.2 <- predict(model.polynomial, newdata=test.x)
    all.f.hat.2[i] <- f.hat.2
    if (make.plot){
        plot(x1, y, pch='+', col='white',
             ylim=c(min(f(x1.population)), max(f(x1.population))),
             xlim=c(min(x1.population), max(x1.population)))
        abline(model, lwd=1, col='green')
        lines(new.x,predict(model.polynomial, newdata=new.x.df), col="blue")
        curve(f, from=min(x1.population), to=max(x1.population), col='red', add=T)
        points(x0, f(x0), pch='o', col='red', cex=2)
        points(x0, f.hat.1, pch='o', col='green', cex=2)
        points(x0, f.hat.2, pch='o', col='blue', cex=2)
    }
}


bias.squared.1 <- (mean(all.f.hat.1) - f.true)^2
variance.1 <- var(all.f.hat.1)

bias.squared.2 <- (mean(all.f.hat.2) - f.true)^2
variance.2 <- var(all.f.hat.2)

bias.squared.1
variance.1
bias.squared.2
variance.2
\end{verbatim}
}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig2_12_all3.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Residual plots}

\begin{itemize}
\item Tracking.

  Correlation between $\epsilon_i$ and $\epsilon_j$ makes the SE of $\hat{\beta}$ smaller than it really is.

Consider the extreme case: copy the training data several times, add the copies, and rerun the linear regression. The SE shrinks!

Correlated $\epsilon$'s are common in time series data.

\begin{center}
\includegraphics[width=0.90\textwidth]{ISLR_Fig3_10.png}
\end{center}  

\item Heteroscedasticity (see the figure on page 96 of the book).

  The variance of $\epsilon$ is not constant: the residuals form a funnel, growing with $|Y|$. Remedies: (1) transform to $\sqrt{Y}$ or $\mathrm{log}(Y)$; (2) Weighted Least Squares (a sketch follows this list).

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_11.png}
\end{center}  

  
  {\tiny http://stat.cmu.edu/~cshalizi/350/lectures/18/lecture-18.pdf}

\begin{center}
\includegraphics[width=0.75\textwidth]{weightedLeastSquaresSimulation1.png}
\end{center}  

\begin{center}
\includegraphics[width=0.75\textwidth]{weightedLeastSquaresSimulation2.png}
\end{center}  


\end{itemize}
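
A sketch of remedy (2), weighted least squares, on funnel-shaped simulated data; each observation is weighted by the inverse of its error variance:

{\small
\begin{verbatim}
set.seed(7)
n <- 200
x <- runif(n, 1, 10)
y <- 2 + 3 * x + rnorm(n, sd = x)    # error variance grows with x
ols <- lm(y ~ x)                     # ignores the heteroscedasticity
wls <- lm(y ~ x, weights = 1 / x^2)  # weight_i = 1 / Var(eps_i)
rbind(ols = coef(ols), wls = coef(wls))
\end{verbatim}
}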



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Linear regression versus KNN}

A parametric method: linear regression assumes a linear relationship. If the truth is nonlinear, the conclusions become suspect; if the assumption is reasonable, the parametric method works very well.

A non-parametric method: KNN regression.

$\displaystyle \hat{f}(x_0) = \frac{1}{K} \sum_{x_i \in \mathcal{N}_0} y_i$, where $\mathcal{N}_0$ contains the $K$ points nearest to $x_0$.
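
A hand-rolled sketch of KNN regression at a single query point (the function name is illustrative):

{\small
\begin{verbatim}
knn.reg.one <- function(x, y, x0, K) {
  nn <- order(abs(x - x0))[1:K]  # indices of the K nearest training points
  mean(y[nn])                    # average their responses
}
set.seed(8)
x <- runif(100); y <- sin(2 * pi * x) + rnorm(100, sd = 0.2)
knn.reg.one(x, y, 0.5, K = 9)    # estimate f(0.5); the true value is 0
\end{verbatim}
}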

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_16.png}
\end{center}

If the true relationship between $Y$ and $X$ is linear, linear regression beats KNN.

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_17.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_18.png}
\end{center}

What if the true relationship between $Y$ and $X$ is \underline{nonlinear}?

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig3_19.png}
\end{center}

The figures above show the single-predictor case, $p=1$.

What about $p > 1$?

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The curse of dimensionality}

Dimension: the number of predictor variables.

Suppose each predictor can take 10 values. With 1 predictor, $10^1$ training points cover the space.
With 2 predictors, $10^2$ points are needed.
With 3 predictors, $10^3$.

The curse of dimensionality:

With a fixed number of points (say $n=100$), the higher the dimension, the fewer close neighbours a test point $x_0$ has. The data become sparse.

Because the dimension is high, overfitting comes easily: good training performance, poor test performance.

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig3_20.png}
\end{center}

Occam's razor: entities should not be multiplied without necessity. When presented with
competing hypothetical answers to a problem, one should select the
answer that makes the fewest assumptions.


Further reading:

{\tiny http://www.visiondummy.com/2014/04/curse-dimensionality-affect-classification}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Logistic regression}


The response variable takes the value 0 or 1. $p(X) = p(Y=1|X)$.

$\displaystyle p(X) = \frac{e^{\beta_0 + \beta_1 X}}{1 + e^{\beta_0 + \beta_1 X}}$ (logistic函数$\frac{e^t}{1 + e^t } = \frac{1}{1+e^{-t}}$)

$\displaystyle odds = \frac{p(X)}{1-p(X)} = e^{\beta_0 + \beta_1 X}$

$\displaystyle \mathrm{log \; odds} = logit = \mathrm{log} \frac{p(X)}{1-p(X)} = {\beta_0 + \beta_1 X}$

Each one-unit increase in $X$ multiplies the old odds by $e^{\beta_1}$: $\displaystyle \mathrm{Odds \; Ratio} = \frac{\mathrm{odds}_1}{\mathrm{odds}_0} = e^{\beta_1}$.
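
Fitting this model in R with glm() on the Default data from the ISLR package (a sketch):

{\small
\begin{verbatim}
library(ISLR)                       # the Default data used in this chapter
fit <- glm(default ~ balance, family = binomial, data = Default)
coef(fit)                           # close to the values in Table 4.1
predict(fit, data.frame(balance = 1000),
        type = "response")          # P(default | balance = 1000)
exp(coef(fit)["balance"])           # odds ratio per unit increase in balance
\end{verbatim}
}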

\newpage
\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig4_2}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Estimating the logistic regression parameters}

By maximum likelihood.

$\displaystyle l(\beta_0, \beta_1) = \prod_{i: y_i=1} p(x_i) \prod_{i':y_{i'}=0} (1-p(x_{i'}))$.

Taking the logarithm of $\displaystyle l(\beta_0, \beta_1)$ turns the product into a sum: $\displaystyle l'(\beta_0, \beta_1) = \sum_{i=1}^{N} [y_i \mathrm{log}\;p(x_i) + (1-y_i) \mathrm{log}\;(1-p(x_i))]$. Solve for the $\beta$'s with the Newton--Raphson algorithm.

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Table4_1}
\end{center}

Given a value of $x$, substitute it into $\displaystyle p(X) = \frac{e^{\beta_0 + \beta_1 X}}{1 + e^{\beta_0 + \beta_1 X}}$ to predict the probability of default.

Logistic regression also accepts a qualitative predictor (such as student).

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Table4_2}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Table4_2_Predicting.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Multiple Logistic Regression}

Several predictor variables (student, income, balance).

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq4_7.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Table4_3.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig4_3.png}
\end{center}

The negative coefficient on student means that with balance and income held fixed, \\
\underline{a student} is less likely to default than \underline{a non-student}.

The predictors student and balance are correlated (a confounding effect): students tend to carry larger balances, and, overall, the student population is more likely to default.

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq4_8_4_9.png}
\end{center}

Credit card company strategy: knowing nothing about a customer's balance, a student is the higher default risk; at any fixed balance, a non-student is the higher risk.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Confounding}


Confounding: A situation in which a measure of association or
relationship between exposure and outcome is distorted by the presence
of another variable.

In the example above, balance can be viewed as a confounder between student and default.


\begin{center}
\includegraphics[width=0.50\textwidth]{confounding_effect_illustration.png}
\end{center}



References:

{\tiny
\begin{verbatim}
3.5 - Bias, Confounding and Effect Modification
https://newonlinecourses.science.psu.edu/stat507/node/34/


Confounding: What it is and how to deal with it
https://www.sciencedirect.com/science/article/pii/S0085253815529748#!


Task 3: Key Concepts about Using Logistic Regression In NHANES
https://www.cdc.gov/nchs/tutorials/dietary/advanced/estimateprevalence/info3.htm
\end{verbatim}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{LDA}

LDA - Linear Discriminant Analysis.

Used for classification ($K \ge 2$ classes).

$\displaystyle p_k(x) = Pr(Y=k | X=x) = \frac{\pi_k f_k(x)}{\sum_{l=1}^K \pi_l f_l(x)}$.

$\pi_k$ is the prior probability (no value of $x$ needed); $p_k(x)$ is the posterior probability (a value of $x$ is needed).
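
In R, lda() from MASS estimates the priors and the class densities and returns the posteriors; a sketch on the Default data:

{\small
\begin{verbatim}
library(MASS)    # lda()
library(ISLR)    # Default data
fit <- lda(default ~ balance + student, data = Default)
fit$prior                              # estimated priors pi_k
head(predict(fit, Default)$posterior)  # posteriors p_k(x) per class
\end{verbatim}
}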

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{$p=1$: a single predictor}

{\bf Assume} $f_k(x)$ follows a Gaussian distribution:

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq4_11.png}
\end{center}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Fig4_4.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The discriminant function}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq4_13.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Estimating the parameters $\pi_k$, $\sigma$, $\mu_k$}

\begin{center}
\includegraphics[width=0.95\textwidth]{ISLR_Eq4_15.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq4_16.png}
\end{center}
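
Putting the $p=1$ pieces together, a small Python sketch (illustrative names; the formulas follow Eq.\ 4.13 and 4.15):

{\footnotesize
\begin{verbatim}
import numpy as np

def lda_1d(x, y):
    """Estimate pi_k, mu_k and the pooled sigma^2, then classify via
    delta_k(x0) = x0*mu_k/sigma^2 - mu_k^2/(2 sigma^2) + log pi_k."""
    classes = np.unique(y)
    n, K = len(x), len(classes)
    pi = {k: np.mean(y == k) for k in classes}       # priors
    mu = {k: x[y == k].mean() for k in classes}      # class means
    sigma2 = sum(((x[y == k] - mu[k])**2).sum()
                 for k in classes) / (n - K)         # pooled variance
    def predict(x0):                                 # largest discriminant wins
        return max(classes, key=lambda k:
                   x0*mu[k]/sigma2 - mu[k]**2/(2*sigma2) + np.log(pi[k]))
    return predict
\end{verbatim}
}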

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{$p > 1$: multiple predictors}

Multivariate Gaussian distribution: $X \sim N(\mu, \Sigma)$.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig4_5.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq4_18.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq4_19.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig4_6.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Table4_4.png}
\end{center}

sensitivity = TP/(TP+FN): the probability that an actual defaulter is predicted to be a defaulter.

specificity = TN/(TN+FP): the probability that an actual non-defaulter is predicted to be a non-defaulter.


Lowering the threshold increases the number of people predicted to be defaulters.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Table4_5.png}
\end{center}

\begin{center}
\includegraphics[width=0.70\textwidth]{ISLR_Fig4_8.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Table4_6.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Table4_7.png}
\end{center}

1 - specificity = 1 - TN/(TN + FP) = FP/(TN + FP) = false positive rate
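
The same quantities in a short Python sketch (the helper name rates is ours; lowering the threshold raises sensitivity at the cost of specificity):

{\footnotesize
\begin{verbatim}
import numpy as np

def rates(y_true, p_hat, threshold=0.5):
    """Sensitivity, specificity and FPR at a given probability threshold."""
    y_pred = (p_hat >= threshold).astype(int)
    TP = np.sum((y_pred == 1) & (y_true == 1))
    FN = np.sum((y_pred == 0) & (y_true == 1))
    TN = np.sum((y_pred == 0) & (y_true == 0))
    FP = np.sum((y_pred == 1) & (y_true == 0))
    return TP/(TP+FN), TN/(TN+FP), FP/(TN+FP)   # sens, spec, 1 - spec
\end{verbatim}
}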

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{QDA}

Quadratic Discriminant Analysis

Each class has its own covariance matrix: $X \sim N(\mu_k, \Sigma_k)$.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq4_23.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig4_9.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Logistic regression, LDA, QDA, and KNN}

No single method is best in all settings.
 
%Gaussian假设

%KNN does not tell us which predictors are important.

%QDA can perform better in the presence of a limited number of training
%observations because it does make some assumptions about the form of
%the decision boundary.

\newpage
\begin{center}
\includegraphics[width=0.75\textwidth]{ISLR_Fig4_11.png}
\end{center}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Cross-validation}

Without touching the unseen test data, hold out part of the training data and treat it as test data, for \underline{model assessment} and \underline{model selection}.

LOO CV, k-fold CV ($k=5, 10, 20$).
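
A generic k-fold CV loop as a Python sketch (fit returns a prediction function, loss scores the held-out fold; both names are ours for illustration):

{\footnotesize
\begin{verbatim}
import numpy as np

def kfold_cv_error(X, y, fit, loss, k=10, seed=0):
    """Average held-out loss over the k folds."""
    idx = np.random.default_rng(seed).permutation(len(y))
    errs = []
    for fold in np.array_split(idx, k):
        train = np.setdiff1d(idx, fold)
        model = fit(X[train], y[train])        # fit on the other k-1 folds
        errs.append(loss(y[fold], model(X[fold])))
    return np.mean(errs)                       # k = n gives LOO CV
\end{verbatim}
}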


\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_1.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_2.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_3.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig3_8.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_4.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_6.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Is a larger $k$ always better in k-fold CV?}

Bias-variance trade-off in the test-error estimate.

Exercise: if $k=n$, the training sets are all very similar, so the fitted models are very similar, and the resulting test errors $E_i$ are highly \underline{correlated}.

$Var(E_1 + E_2) = Var(E_1) + Var(E_2) + 2 \mathbf{Cov(E_1, E_2)}$.

$\displaystyle Var(\bar{E}) = \frac{\sigma^2}{n} + \frac{n-1}{n} \rho \sigma^2$

https://en.wikipedia.org/wiki/Variance
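
A simulation sketch checking this formula (assuming numpy; generating equicorrelated errors as a shared factor plus independent noise is one standard construction):

{\footnotesize
\begin{verbatim}
import numpy as np

n, rho, sigma2, reps = 10, 0.7, 1.0, 200000
rng = np.random.default_rng(0)
# Equicorrelated errors: shared factor plus independent noise.
common = np.sqrt(rho * sigma2) * rng.standard_normal(reps)
noise = np.sqrt((1 - rho) * sigma2) * rng.standard_normal((reps, n))
E = common[:, None] + noise
print(E.mean(axis=1).var())                   # empirical Var(E-bar)
print(sigma2/n + (n - 1)/n * rho * sigma2)    # theoretical value
\end{verbatim}
}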


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{k-fold CV for classification}

\begin{center}
\includegraphics[width=0.65\textwidth]{ISLR_Fig5_7.png}
\end{center}

\begin{center}
\includegraphics[width=0.90\textwidth]{ISLR_Fig5_8.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The Bootstrap}

Sample with replacement from the sample itself, not from the population.
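
A minimal bootstrap sketch in Python (bootstrap\_se is our illustrative name):

{\footnotesize
\begin{verbatim}
import numpy as np

def bootstrap_se(data, statistic, B=1000, seed=0):
    """Standard error of a statistic by resampling the sample itself."""
    rng = np.random.default_rng(seed)
    n = len(data)
    stats = [statistic(data[rng.integers(0, n, n)])  # with replacement
             for _ in range(B)]
    return np.std(stats, ddof=1)

# e.g. bootstrap_se(x, np.median) estimates the SE of the median,
# for which no simple closed-form formula exists.
\end{verbatim}
}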

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_11.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq5_7.png}
\end{center}

\newpage
\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq5_8.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig5_10.png}
\end{center}


%$Var(\alpha X + (1-\alpha) Y)$,找到$\alpha$使得前式最大。

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Subset selection of predictors}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Algorithm_6_1.png}
\end{center}

The problem: RSS and $R^2$ always improve as more variables are added. Why?

Remedies: (1) $C_p$, AIC, BIC, adjusted $R^2$; (2) the validation set approach; (3) CV.

$\hat{\sigma}^2$ is the variance of $\epsilon$ in $Y=\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p + \epsilon$.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq_6_2.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq_AIC.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq_6_3.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq_6_4.png}
\end{center}

Figure 6.2

Figure 6.3
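
A greedy forward-stepwise sketch of the subset-selection idea in Python; the training RSS only picks the best model of each size, while the best size must come from $C_p$/AIC/BIC/adjusted $R^2$ or CV:

{\footnotesize
\begin{verbatim}
import numpy as np

def rss(X, y, subset):
    """Training RSS of least squares on the chosen columns."""
    Xs = np.column_stack([np.ones(len(y))] + [X[:, j] for j in subset])
    beta, *_ = np.linalg.lstsq(Xs, y, rcond=None)
    return ((y - Xs @ beta)**2).sum()

def forward_stepwise(X, y):
    """At each step add the predictor that lowers training RSS the most."""
    remaining, chosen, path = set(range(X.shape[1])), [], []
    while remaining:
        j = min(remaining, key=lambda j: rss(X, y, chosen + [j]))
        chosen.append(j); remaining.discard(j)
        path.append(list(chosen))      # best model of each size
    return path
\end{verbatim}
}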


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Shrinkage methods}

Shrinking the predictors' coefficients (toward 0) achieves the effect of selecting a subset of predictors.

Two approaches: ridge regression and the lasso.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Ridge regression}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq6_5.png}
\end{center}

$\hat{\beta}  = (X^T X + \lambda I)^{-1} X^T y$.
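A numpy sketch of this closed form (assuming standardized predictors and a centered response, so the intercept can be dropped):

{\footnotesize
\begin{verbatim}
import numpy as np

def ridge(X, y, lam):
    """Closed-form ridge coefficients; lam = 0 recovers least squares,
    lam -> infinity shrinks all coefficients toward zero."""
    return np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)
\end{verbatim}
}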

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig6_4.png}
\end{center}

Its advantage comes from the bias-variance trade-off: a small increase in bias can buy a large reduction in variance, lowering the test error.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig6_5.png}
\end{center}

When $n$ (the number of observations) is not much larger than $p$ (the number of predictors), the variance of least squares is large, so ridge regression performs best in this setting.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{The Lasso}

Its advantage over ridge regression: it is better suited to variable selection.

When $\lambda$ is large enough, some coefficients become exactly zero (see the sketch below).

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq6_7.png}
\end{center}
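
A quick demonstration of that sparsity, assuming scikit-learn is available (its Lasso calls $\lambda$ ``alpha''); the simulated data have only two true signals:

{\footnotesize
\begin{verbatim}
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10))
y = 3*X[:, 0] - 2*X[:, 1] + rng.standard_normal(200)  # 2 real signals

for alpha in (0.01, 0.1, 1.0):
    coef = Lasso(alpha=alpha).fit(X, y).coef_
    print(alpha, np.sum(coef != 0))   # larger alpha -> more exact zeros
\end{verbatim}
}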

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig6_6.png}
\end{center}


\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig6_7.png}
\end{center}


\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq6_8_and_Eq6_9.png}
\end{center}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\foilhead{Decision Trees}

Figures 8.1, 8.4, 8.5.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig8_1.png}
\end{center}

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig8_4.png}
\end{center}


Grow the tree by recursive binary splitting: at each step, find one predictor $X_j$ and one cutpoint $s$ that minimize the RSS.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq8_2_8_3.png}
\end{center}
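
One step of that splitting as a Python sketch (exhaustive search over $(j, s)$; recurse on each side to grow the full tree):

{\footnotesize
\begin{verbatim}
import numpy as np

def best_split(X, y):
    """Find the (j, s) whose two half-plane means minimize total RSS."""
    best = (None, None, np.inf)
    for j in range(X.shape[1]):
        for s in np.unique(X[:, j]):
            left, right = y[X[:, j] < s], y[X[:, j] >= s]
            if len(left) == 0 or len(right) == 0:
                continue
            total = ((left - left.mean())**2).sum() + \
                    ((right - right.mean())**2).sum()
            if total < best[2]:
                best = (j, s, total)
    return best
\end{verbatim}
}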


Trees tend to overfit the training data, so the tree is pruned (tree pruning) to make it simpler: cost complexity pruning.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Eq8_4.png}
\end{center}

$|T|$ is the number of leaves (terminal nodes) of the tree.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig8_5.png}
\end{center}

Classification trees work the same way; each leaf should contain data points whose classes are as homogeneous as possible.

Bagging (bootstrap aggregation). Goal: reduce variance.

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_fbag.png}
\end{center}

$f^{*b}$ is obtained from the $b$th bootstrap sample.
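
A bagging sketch in Python (fit\_tree is assumed to return a prediction function; averaging $B$ bootstrap-grown trees reduces variance):

{\footnotesize
\begin{verbatim}
import numpy as np

def bagged_predict(X, y, x0, fit_tree, B=100, seed=0):
    """Average the predictions of B trees, each grown on a bootstrap
    sample of the training data."""
    rng = np.random.default_rng(seed)
    n = len(y)
    preds = []
    for _ in range(B):
        idx = rng.integers(0, n, n)                  # b-th bootstrap sample
        preds.append(fit_tree(X[idx], y[idx])(x0))   # f^{*b}(x0)
    return np.mean(preds)
\end{verbatim}
}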

Random Forests: build each tree considering a random selection of $m$ of the $p$ predictors (often $m \approx \sqrt{p}$).

\begin{center}
\includegraphics[width=0.85\textwidth]{ISLR_Fig8_8.png}
\end{center}


% Skipped: Boosting.   Each tree is fit on a modified version of the original data set


\end{document}