% test-tufte.tex
% Options for packages loaded elsewhere
% These are queued before \documentclass so that they take effect whenever the
% named package is eventually loaded. hyperref and url are never loaded by an
% explicit \usepackage in this preamble; xcolor is loaded further down.
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
% Document class: KOMA-Script article (scrartcl), letter paper, one-sided.
% NOTE(review): DIV=11 requests a KOMA typearea layout, but the geometry call
% later in the preamble sets the margins explicitly -- presumably geometry
% wins; confirm.
\documentclass[
letterpaper,
DIV=11,
numbers=noendperiod,
oneside]{scrartcl}
% Math support, then engine-specific encoding/font setup.
\usepackage{amsmath,amssymb}
% iftex supplies the \ifPDFTeX and \ifLuaTeX conditionals used in this preamble.
\usepackage{iftex}
\ifPDFTeX
% pdfTeX: 8-bit engine, so declare the font and input encodings explicitly.
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
% Unicode engines: unicode-math replaces the fontenc/inputenc pair.
\usepackage{unicode-math}
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
% Empty branch: no additional fonts are selected for xetex/luatex here.
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph separation. Three alternatives, chosen at load time:
%   - non-KOMA class with parskip.sty available: load the parskip package;
%   - non-KOMA class without it: zero \parindent and a stretchable 6pt \parskip;
%   - KOMA class: use the class's own parskip=half option.
% This file uses scrartcl, a KOMA class, so the last alternative is expected
% to apply.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
% Colour support; receives the dvipsnames/svgnames/x11names options queued at
% the top of the file.
\usepackage{xcolor}
% Page geometry with a wide margin-note column:
%   1in left margin + 4.1333in text + 0.3in gap + 2.0667in margin column = 7.5in.
\usepackage[left=1in,marginparwidth=2.0666666666667in,textwidth=4.1333333333333in,marginparsep=0.3in]{geometry}
% Underline/strike-out support: lua-ul (with its soul compatibility option)
% under LuaTeX, plain soul otherwise.
\ifLuaTeX
\usepackage{luacolor}
\usepackage[soul]{lua-ul}
\else
\usepackage{soul}
\fi
\setlength{\emergencystretch}{3em} % prevent overfull lines
% Number sectioning commands down to depth 5.
\setcounter{secnumdepth}{5}
% Make \paragraph and \subparagraph free-standing
% Each is wrapped (only if it exists) so that an empty \mbox follows the
% heading; the original command is kept under the \old... name.
\ifx\paragraph\undefined\else
\let\oldparagraph\paragraph
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
\fi
\ifx\subparagraph\undefined\else
\let\oldsubparagraph\subparagraph
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
\fi
% Syntax-highlighting support for code chunks.
% NOTE(review): xcolor is already loaded above, so this \usepackage{color} is
% presumably redundant; confirm before removing.
\usepackage{color}
\usepackage{fancyvrb}
\newcommand{\VerbBar}{|}
% \VERB: inline verbatim in which backslash and braces stay active, so the
% *Tok colour macros below can be used inside it.
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
% Highlighting: the verbatim environment used for code blocks in the body,
% with the same backslash/brace command characters.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
% Shaded: wraps code blocks in a snugshade box filled with shadecolor.
\definecolor{shadecolor}{RGB}{241,243,245}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% One macro per token class; each takes the token text as its single argument
% and colours it (a few also set it in italics).
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.40,0.45,0.13}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\BuiltInTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\ExtensionTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.28,0.35,0.67}{#1}}
\newcommand{\ImportTok}[1]{\textcolor[rgb]{0.00,0.46,0.62}{#1}}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\NormalTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.68,0.00,0.00}{#1}}
\newcommand{\RegionMarkerTok}[1]{\textcolor[rgb]{0.00,0.23,0.31}{#1}}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{#1}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.07,0.07,0.07}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.13,0.47,0.30}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.37,0.37,0.37}{\textit{#1}}}
% \tightlist: issued at the start of compact lists in the body; removes the
% vertical space between items and between paragraphs inside items.
% \providecommand leaves any existing definition untouched.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}%
  \setlength{\parskip}{0pt}%
}
% Table packages.
\usepackage{longtable}
\usepackage{booktabs}
\usepackage{array}
\usepackage{calc} % for calculating minipage widths
% Correct order of tables after \paragraph or \subparagraph:
% patch the first \par inside \longtable so that an empty box is emitted
% while a run-in heading is still pending (\if@noskipsec).
\usepackage{etoolbox}
\makeatletter
\patchcmd{\longtable}{\par}{\if@noskipsec\mbox{}\fi\par}{}{}
\makeatother
% Allow footnotes in longtable head/foot: prefer footnotehyper, fall back to
% footnote when it is not installed.
\IfFileExists{footnotehyper.sty}{%
  \usepackage{footnotehyper}%
}{%
  \usepackage{footnote}%
}
\makesavenoteenv{longtable}
\usepackage{graphicx}
\makeatletter
% \maxwidth / \maxheight: the image's natural size, capped at \linewidth and
% \textheight respectively. \Gin@nat@width and \Gin@nat@height are graphicx
% internals, hence the \makeatletter.
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
% Additional table-related packages.
% NOTE(review): this set looks like the list R's kableExtra asks for --
% presumably injected for generated tables; confirm.
% booktabs, longtable, array and xcolor are already loaded earlier in this
% preamble, so those four lines are repeat loads.
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
% KOMA option: format table captions as headings placed above the table.
\KOMAoption{captions}{tableheading}
% Callout boxes: load tcolorbox and fontawesome5 unless already present, then
% define the colours that the tcolorbox callouts in the body refer to.
\makeatletter
\@ifpackageloaded{tcolorbox}{}{\usepackage[skins,breakable]{tcolorbox}}
\@ifpackageloaded{fontawesome5}{}{\usepackage{fontawesome5}}
% Base colours, one per callout kind (used in the body for the title icon and,
% tinted, for the title background).
\definecolor{quarto-callout-color}{HTML}{909090}
\definecolor{quarto-callout-note-color}{HTML}{0758E5}
\definecolor{quarto-callout-important-color}{HTML}{CC1914}
\definecolor{quarto-callout-warning-color}{HTML}{EB9113}
\definecolor{quarto-callout-tip-color}{HTML}{00A047}
\definecolor{quarto-callout-caution-color}{HTML}{FC5300}
% Matching frame colours (used in the body as colframe=...-frame).
\definecolor{quarto-callout-color-frame}{HTML}{acacac}
\definecolor{quarto-callout-note-color-frame}{HTML}{4582ec}
\definecolor{quarto-callout-important-color-frame}{HTML}{d9534f}
\definecolor{quarto-callout-warning-color-frame}{HTML}{f0ad4e}
\definecolor{quarto-callout-tip-color-frame}{HTML}{02b875}
\definecolor{quarto-callout-caution-color-frame}{HTML}{fd7e14}
\makeatother
% Caption names and the "codelisting" float.
% The \AtBeginDocument hook sets the fixed strings for the table of contents,
% list of figures, list of tables, and the Figure/Table caption labels. Each
% name is redefined if it already exists and created otherwise.
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\AtBeginDocument{%
\ifdefined\contentsname
\renewcommand*\contentsname{Table of contents}
\else
\newcommand\contentsname{Table of contents}
\fi
\ifdefined\listfigurename
\renewcommand*\listfigurename{List of Figures}
\else
\newcommand\listfigurename{List of Figures}
\fi
\ifdefined\listtablename
\renewcommand*\listtablename{List of Tables}
\else
\newcommand\listtablename{List of Tables}
\fi
\ifdefined\figurename
\renewcommand*\figurename{Figure}
\else
\newcommand\figurename{Figure}
\fi
\ifdefined\tablename
\renewcommand*\tablename{Table}
\else
\newcommand\tablename{Table}
\fi
}
% New float type "codelisting" in the ruled style, labelled "Listing", with
% its list written to the .lop file. It is numbered within chapters only when
% the class defines a chapter counter.
\@ifpackageloaded{float}{}{\usepackage{float}}
\floatstyle{ruled}
\@ifundefined{c@chapter}{\newfloat{codelisting}{h}{lop}}{\newfloat{codelisting}{h}{lop}[chapter]}
\floatname{codelisting}{Listing}
% \listoflistings: prints the list of all codelisting floats.
\newcommand*\listoflistings{\listof{codelisting}{List of Listings}}
\makeatother
% Guarded package loads: each package is loaded only if nothing earlier has
% already loaded it.
% (The first \makeatletter/\makeatother pair is empty.)
\makeatletter
\makeatother
% Captions and sub-captions.
\makeatletter
\@ifpackageloaded{caption}{}{\usepackage{caption}}
\@ifpackageloaded{subcaption}{}{\usepackage{subcaption}}
\makeatother
% Margin material: the body uses \marginnote, \sidecaption and the
% marginfigure environment -- presumably supplied by these two packages.
\makeatletter
\@ifpackageloaded{sidenotes}{}{\usepackage{sidenotes}}
\@ifpackageloaded{marginnote}{}{\usepackage{marginnote}}
\makeatother
% TikZ, needed by \circled below.
\makeatletter
\@ifpackageloaded{tikz}{}{\usepackage{tikz}}
\makeatother
% \circled{<text>}: typesets <text> at \scriptsize inside a drawn circle,
% aligned on the surrounding baseline via the node's base anchor.
\newcommand*\circled[1]{%
  \tikz[baseline=(char.base)]{%
    \node[shape=circle,draw,inner sep=1pt] (char) {{\scriptsize#1}};%
  }%
}
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
% NOTE(review): hyperref is never loaded explicitly in this preamble; the
% \urlstyle and \hypersetup calls below presumably rely on bookmark pulling
% in hyperref (and with it url) -- confirm.
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same} % disable monospaced font for URLs
% PDF metadata and link colours. Maroon and Blue are named xcolor colours made
% available by the options passed to xcolor at the top of the file.
% NOTE(review): pdfauthor reads "Daniel Kaplan" while \author further down
% reads "Daniel T. Kaplan".
\hypersetup{
pdftitle={Lessons in Statistical Thinking},
pdfauthor={Daniel Kaplan},
colorlinks=true,
linkcolor={blue},
filecolor={Maroon},
citecolor={Blue},
urlcolor={Blue},
pdfcreator={LaTeX via pandoc}}
% Title-page metadata consumed by \maketitle.
\title{Lessons in Statistical Thinking}
\usepackage{etoolbox}
\makeatletter
% Fallback \subtitle for classes that lack one: appends the subtitle, in
% \large, to the stored title. \providecommand does nothing when \subtitle is
% already defined.
% NOTE(review): KOMA classes such as scrartcl ship their own \subtitle, so
% this fallback is presumably unused here; confirm.
\providecommand{\subtitle}[1]{% add subtitle to \maketitle
\apptocmd{\@title}{\par {\large #1 \par}}{}{}
}
\makeatother
\subtitle{Intro Stats for the 21\textsuperscript{st} Century}
\author{Daniel T. Kaplan}
\date{2024-03-23}
\begin{document}
\maketitle
% Heading text for the table of contents (the same string is also set by the
% \AtBeginDocument hook in the preamble).
\renewcommand*\contentsname{Table of contents}
% Table of contents, listing entries down to depth 3. The braces form a group
% around the \hypersetup call, which empties linkcolor -- presumably so that
% the link-colour change applies to the contents entries only; confirm.
{
\hypersetup{linkcolor=}
\setcounter{tocdepth}{3}
\tableofcontents
}
\newpage
\section{Data frames}\label{sec-data-frames}
The origin of recorded history is, literally, data. Five-thousand years
ago, in Mesopotamia, the climate was changing. Retreating sources of
irrigation water called for an organized and coordinated response,
beyond the scope of isolated clans of farmers. To provide this response,
a new social structure -- government -- was established and grew. Taxes
were owed and paid, each transaction recorded. Food grain had to be
measured and stored, livestock counted, trades and shipments
memorialized.
Writing emerged as the technological innovation to keep track of all
this. We know this today because memoranda were incised by stylus on
soft clay tablets and baked into permanence. When the records were no
longer needed, they were recycled as building materials for the growing
settlements and cities. Archaeologists started uncovering these tablets
more than 100 years ago, spending decades to decipher the meaning of the
stylus marks in clay.
The writing and record-keeping technology developed over time: knots in
string, wax tablets, papyrus, vellum, paper, and computer memory. Making
sense of the records has always required \emph{literacy}, deciphering
marks according to the system and language used to represent the
writer's intent. Today, in many societies, the vast majority of people
have been taught to read and write their native language according to
the accepted conventions.
Conventions of record keeping diverge from those of everyday language.
For instance, financial transaction records must be guarded against
error and fraud. Starting in the thirteenth century, financial
accountants adopted a practice---double-entry bookkeeping---that has no
counterpart in everyday language.
{\marginnote{\begin{footnotesize}``\href{https://en.wikipedia.org/wiki/Double-entry_bookkeeping}{Double-entry
bookkeeping}'' records each transaction \emph{twice}, in two different
places, in the form of a credit to one account and a debit from another
account.\end{footnotesize}}}
Modern conventions make working with data more accessible and more
reliable. Of primary interest to us in these \emph{Lessons} is the
organization provided by a ``\textbf{data frame},'' a structure for
holding data as exemplified in Figure~\ref{fig-data-frame-schematic}.
\begin{figure}
\sidecaption{\label{fig-data-frame-schematic}A data frame organizes
observed facts into rows and columns. Each column is a
\textbf{variable}. Each row is a \textbf{specimen}. Here, there are four
variables and five specimens. --- The display in
Figure~\ref{fig-data-frame-schematic} shows a small part of a larger
data frame holding observations collected by statistician Francis Galton
in the 1880s. I will use this data frame repeatedly across these lessons
because of the outsized historical role the data played in the
development of statistical methodology. The context for the data
collection was Galton's attempt to quantify the heritability of
biological traits. The particular trait of interest to Galton (probably
because it is easily measured) is human stature. Galton recorded the
heights of full-grown children and their parents.}
% Cap the image at the width of the text block. The previous fixed width of
% 8.05in is roughly twice the 4.13in text width set by geometry and would run
% off the page. With the preamble's global keepaspectratio, width and height
% act as upper bounds, so the aspect ratio is preserved.
\centering{
\includegraphics[width=\linewidth,height=\textheight]{www/data-frame-schematic.png}
}
\end{figure}%
The row-and-column organization of a data frame is reminiscent of a
spreadsheet. However, data frames have additional organizational
requirements that typical spreadsheet software does not enforce. The
term ``\textbf{tidy data}'' emphasizes that these requirements are being
met.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Each variable must consist of the same kind of individual entries. For
example, the \texttt{mother} variable consists of numbers: a quantity.
In this case, the quantity is the mother's height in inches. It would
not be legitimate for an entry in \texttt{mother} to be a word or to
be a height in meters or something else entirely, for instance, a
blood pressure.
\item
Each row represents an individual real-world entity. For the data
frame shown in Figure~\ref{fig-data-frame-schematic}, each row
corresponds to an individual, fully-grown child. We use the term
``\textbf{unit of observation}'' to refer to the \emph{kind of entity}
represented in each row. All rows in a data frame must be the same
kind of unit of observation. It would not be legitimate for some rows
to refer to individual people while others refer to something different such as
a house or family or country. If you wanted to record data on
families, you would need to create a new data frame where the unit of
observation is a family.
\end{enumerate}
We use the word ``\textbf{specimen}'' to refer to an individual instance
of the unit of observation. A data frame is a collection of specimens.
Each row represents a unique specimen.
The unit of observation in Figure~\ref{fig-data-frame-schematic} is a
full-grown child. The fifth row in that data frame refers to a unique
young woman in London in the 1880s (whose name is lost to history). By
using the word ``specimen'' to refer to this woman, we do not mean to
dehumanize her. However, we need a phrase that can be applied to a
single row of any data frame, whatever its unit of observation might be:
a shipping container, a blood sample, a day of ticket sales, and so on.
The collection of specimens comprised by a data frame is often a
``\textbf{sample}'' from a larger group of the units of observation.
Galton did not measure the height of every fully-grown child in London,
England, the UK, or the world. He collected a \emph{sample} from London
families. Sometimes, a data frame includes every possible instance of
the unit of observation. For example, a library catalog lists
comprehensively the books in a library. Such a comprehensive collection
is called a ``\textbf{census}.''
\begin{tcolorbox}[enhanced jigsaw, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, breakable, opacitybacktitle=0.6, colback=white, coltitle=black, arc=.35mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Example: New-born babies}, left=2mm, colframe=quarto-callout-note-color-frame, rightrule=.15mm, bottomrule=.15mm, leftrule=.75mm, bottomtitle=1mm, toptitle=1mm, titlerule=0mm, toprule=.15mm]
The US Centers for Disease Control (CDC) publishes a ``public use file''
each year, a data frame where the unit of observation is an infant born
in the US. (The many variables include the baby's weight and sex, the
mother's age, and the number of prenatal care visits during the
pregnancy.) The published file for 2022 contains 3,699,040 rows; that is
the number of (known) births in 2022. As such, the CDC data constitutes
a \textbf{census} rather than a \textbf{sample}.
\end{tcolorbox}
\subsection{Types of variables}\label{types-of-variables}
Each column of a data frame is a variable. The word ``variable'' is
appropriate because the entries within a variable \textbf{vary} from
one row to another. Other words with the same root include
``variation,'' ``variety,'' and even ``diversity.''
Data-frame variables come in two fundamental types:
{\marginnote{\begin{footnotesize}The distinction between quantitative
and categorical variables is fundamental to statistical work. You should
be able to discern whether a variable is categorical or quantitative
from a glance at a data frame.\end{footnotesize}}}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
\textbf{Quantitative} variables record an ``amount'' of something.
These might just as well be called ``numerical'' variables.
\item
\textbf{Categorical} variables typically consist of letters. For
instance, the \texttt{sex} variable in
Figure~\ref{fig-data-frame-schematic} contains entries that are either
\textbf{F} or \textbf{M}. In most of the data we work with in these
\emph{Lessons}, there is a fixed set of entry values called the
\textbf{levels} of the categorical variable. The levels of
\texttt{sex} are \textbf{F} and \textbf{M}.
\end{enumerate}
{\marginnote{\begin{footnotesize}We are not doing full justice to the
variety of possible variable types by focusing on just two types:
quantitative and categorical. You should be aware that there are other
kinds, for example, photographs or dates.\end{footnotesize}}}
\begin{tcolorbox}[enhanced jigsaw, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, breakable, opacitybacktitle=0.6, colback=white, coltitle=black, arc=.35mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Example (cont.): The CDC births data frame}, left=2mm, colframe=quarto-callout-note-color-frame, rightrule=.15mm, bottomrule=.15mm, leftrule=.75mm, bottomtitle=1mm, toptitle=1mm, titlerule=0mm, toprule=.15mm]
Among the many variables in the CDC public use file of births are
\texttt{place} and \texttt{diabetes\_gest}, which record the place of
birth and whether the mother developed gestational diabetes.
The \texttt{place} variable is categorical, with these levels:
\begin{itemize}
\tightlist
\item
``hospital''
\item
``home (intended)''
\item
``home (unintended)''
\item
``freestanding''
\item
``other''
\end{itemize}
The \texttt{diabetes\_gest} variable has two levels: \textbf{N} or
\textbf{Y}.
\end{tcolorbox}
\subsection{The codebook}\label{sec-codebook}
How are you to know for any given data frame what constitutes the unit
of observation or what each variable is about? This information,
sometimes called \textbf{metadata}, is stored outside the data frame.
Often, the metadata is contained in a separate documentation file called
a ``\textbf{codebook}.''
To start, the codebook should make clear what is the unit of observation
for the data frame. For instance, we described the unit of observation
for the data frame shown in Figure~\ref{fig-data-frame-schematic} as a
fully grown child. This detail is important. For instance, each such
child---each specimen---can appear only once in the data frame. In
contrast, the same \texttt{mother} and \texttt{father} might appear for
multiple specimens, namely, the siblings of the child.
In the CDC data frame, the unit of observation is a newborn baby. If a
birth resulted in twins, each of the two babies will have its own row.
In contrast, imagine a data frame for the birth mothers or another for
prenatal care visits. Each mother could appear only once in the
birth-mothers frame, but the same mother can appear multiple times in
the prenatal care data frame.
For quantitative variables, the relevant metadata includes what the
number refers to (e.g., mother's height or baby's weight) and the
physical units of that quantity (e.g., inches for height or grams for
weight).
For categorical variables, the metadata should describe the meaning of
each level in as much detail as necessary.
\begin{tcolorbox}[enhanced jigsaw, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, breakable, opacitybacktitle=0.6, colback=white, coltitle=black, arc=.35mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Example (cont.): CDC births codebook}, left=2mm, colframe=quarto-callout-note-color-frame, rightrule=.15mm, bottomrule=.15mm, leftrule=.75mm, bottomtitle=1mm, toptitle=1mm, titlerule=0mm, toprule=.15mm]
The codebook for the CDC data is a PDF document entitled ``User Guide to
the 2022 Natality Public Use File.'' You can access it on the
\href{https://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/DVS/natality/UserGuide2022.pdf}{CDC
website}.
\end{tcolorbox}
\subsection{Accessing data frames}\label{accessing-data-frames}
Most statistics software, including R, makes it easy to access data
frames stored as files in any of a variety of formats. (For examples,
see Exercise 1.18.)
Almost all the data frames used as examples or exercises in these
\emph{Lessons} are stored in files provided by R software
``\textbf{packages}'' such as \texttt{\{LSTbook\}} or
\texttt{\{mosaicData\}}. The data frame itself is easily accessed by a
simple name, e.g., \texttt{Galton}. The location of the data frame is
specified by the package name as a prefix followed by a pair of colons,
e.g.~\texttt{mosaicData::Galton}. A convenient feature of this system is
the easy access to documentation by giving a command consisting of a
question mark followed by the
\emph{package-name}::\emph{data-frame-name}, e.g.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{?mosaicData}\SpecialCharTok{::}\NormalTok{Galton}
\end{Highlighting}
\end{Shaded}
\subsection{Computing with data frames}\label{sec-computing-data-frames}
Lessons~\ref{sec-point_plots}, \ref{sec-variation-and-distribution}, and
\ref{sec-model-annotation} cover how to make informative graphics that
give an overview of the contents in a data frame.
Lesson~\ref{sec-wrangling} introduces commands for manipulating the contents of
a data frame to put them in a more useful form for the data graphics or
data summary task at hand.
This Lesson shows you how to access data frames and their documentation
and how to perform simple tasks such as listing the variable names or
glimpsing a few rows of a data frame.
There are many software systems for working with data frames. Commonly
available spreadsheet software, while suited to some data-entry and
data-summarizing tasks, is surprisingly limited when it comes to
statistical thinking. The system we will use, RStudio, is one of a
handful used by data science professionals. It's available free both as
an online, browser-based platform and for installation on a laptop
computer or computer server.
Much of the statistical work you do in RStudio consists of writing
commands in the R language. The word ``language'' is offputting to many
people, associating it as they do with natural languages such as Chinese
or Spanish, mastery of which takes time and much work. Fortunately, you
do not have to learn the R language; you need only a couple dozen R
expressions to work through all these \emph{Lessons}.
\marginnote{\begin{footnotesize}
If you are on your own, the instructions below provide a quick way to
get started with minimal effort.
If you are a student using these Lessons as part of a class, check with
your instructor who may already have set up a way for you to access
RStudio.
\end{footnotesize}}
We continue here under the assumption that you have already been shown
how to install and access RStudio by an instructor or other mentor. That
person will have arranged to install some additional software written
for these \emph{Lessons}, particularly the \texttt{\{LSTbook\}} package.
Each time you open RStudio, load the \texttt{\{LSTbook\}} package using
this \textbf{command} at the R prompt in the ``console'' tab.
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(LSTbook)}
\end{Highlighting}
\end{Shaded}
\begin{tcolorbox}[enhanced jigsaw, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, breakable, opacitybacktitle=0.6, colback=white, coltitle=black, arc=.35mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Starting out with R via \texttt{posit.cloud}}, left=2mm, colframe=quarto-callout-note-color-frame, rightrule=.15mm, bottomrule=.15mm, leftrule=.75mm, bottomtitle=1mm, toptitle=1mm, titlerule=0mm, toprule=.15mm]
Note: Otherwise \ldots{}
\texttt{posit.cloud} is a ``freemium'' web service. The word
``freemium'' signals that you can use it for free, up to a point.
Fortunately, that point will suffice for you to follow all of these
\emph{Lessons}.
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
In your browser, follow
\href{https://posit.cloud/content/6532153}{this link}. This will take
you to \texttt{posit.cloud} and, after asking you to login via Google
or to set up an account, will bring you to a page that will look much
like the following. (It may take a few minutes.)
\end{enumerate}
\includegraphics{www/posit-cloud.png}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\item
On the left half of the window, there are three ``tabs'' labelled
``Console,'' ``Terminal,'' and ``Background Jobs.'' You will be
working in the ``Console'' tab. Click in that tab and you will see a
flashing \texttt{\textbar{}} cursor after the \texttt{\textgreater{}}
sign.
\item
Give this command, exactly as written, and press return:
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(LSTbook)}
\end{Highlighting}
\end{Shaded}
\end{tcolorbox}
Now you are ready to go.
All of your work with R will consist of giving commands at the
\texttt{\textgreater{}} prompt and pressing return. Possibly the
simplest of all commands is merely the name of a data frame. For
instance, the \texttt{\{LSTbook\}} package provides, among many others,
a data frame named \texttt{AAUP}. Try this as a command:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{AAUP}
\end{Highlighting}
\end{Shaded}
The result of such a command will be a print-out of the first several
rows and columns of the data frame. Some of the data frames provided by
\texttt{\{LSTbook\}} have a couple of dozen rows, others have tens of
thousands. Printing out the first few rows of a data frame is useful
since it shows the variable names and you can see whether each variable
is quantitative or categorical.
To see the codebook for a data frame, simply precede the name with the
\texttt{?} character, for instance:
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{?Births2022}
\end{Highlighting}
\end{Shaded}
\begin{marginfigure}
\centering{
\includegraphics{www/births-documentation.png}
}
\caption{\label{fig-cdc-codebook}The codebook for the CDC births data
frame can be accessed with \texttt{?Births2022}. When displayed in the
RStudio Help tab, you can scroll through the descriptions of all 38
variables.}
\end{marginfigure}%
RStudio arranges for the codebook to be displayed in the ``Help'' tab.
This allows you to scroll through the documentation, follow web links
(if any), and keep the names of the variables displayed in the Help tab
while you write commands in the Console tab.
Commands you will use in these \emph{Lessons} will often start with the
name of a data frame followed by a ``\textbf{pipeline}'' symbol
\texttt{\textbar{}\textgreater{}} which is then followed by a
description of the action you want to perform. Let's consider two simple
actions:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
Count the rows in the data frame:
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{AAUP }\SpecialCharTok{|\textgreater{}} \FunctionTok{nrow}\NormalTok{()}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
[1] 28
\end{verbatim}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\setcounter{enumi}{1}
\tightlist
\item
List the names of the variables.
\end{enumerate}
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{AAUP }\SpecialCharTok{|\textgreater{}} \FunctionTok{names}\NormalTok{()}
\end{Highlighting}
\end{Shaded}
\begin{verbatim}
[1] "subject" "acsal" "fem" "unemp" "nonac" "nonacsal" "licensed"
\end{verbatim}
These two commands have a similar structure involving four elements.
\includegraphics{www/latex-image-pipe.png}
There are two names in this command: the name of a data frame and a
``\textbf{function}'' name. The function name is how you specify what
you want to calculate from the data frame.
There are also two bits of punctuation:
\begin{itemize}
\item
the pipeline symbol \texttt{\textbar{}\textgreater{}}, which connects
the data frame to the function.
\item
a pair of open and close parentheses immediately following the
function name. Every time you use a function the function name will be
followed by parentheses.
\end{itemize}
\begin{tcolorbox}[enhanced jigsaw, colbacktitle=quarto-callout-note-color!10!white, opacityback=0, breakable, opacitybacktitle=0.6, colback=white, coltitle=black, arc=.35mm, title=\textcolor{quarto-callout-note-color}{\faInfo}\hspace{0.5em}{Tables versus data frames}, left=2mm, colframe=quarto-callout-note-color-frame, rightrule=.15mm, bottomrule=.15mm, leftrule=.75mm, bottomtitle=1mm, toptitle=1mm, titlerule=0mm, toprule=.15mm]
You may notice that the displays of data frames printed in this book are
given labels such as Table~\ref{tbl-galton-dataframe}. It is natural to
wonder why the word ``table'' is used sometimes and ``data frame'' other
times.
In these \emph{Lessons} we make the following distinction. A ``data
frame'' stores values in the strict format of rows and columns described
previously. Data frames are ``machine readable.''
The data scientist working with data frames often seeks to create a
\textbf{display} intended for human eyes. A ``table'' is one kind of
\textbf{display} for humans. Since humans have common sense and have
learned many ways to communicate with other humans, a table does not
have to follow the restrictions placed on data frames. Tables are not
necessarily organized in strict row-column format, and they can include
units for numerical quantities as well as comments. An example is the table put together
by Francis Galton (Figure~\ref{fig-galton-notebook}) to organize his
measurements of heights.
\begin{figure}[H]
\sidecaption{\label{fig-galton-notebook}An excerpt from Francis Galton's
notebook recording the heights of parents and children in London in the
1880s.}
\centering{
\includegraphics[width=3.92in,height=\textheight]{www/galton-notebook-excerpt.png}
}
\end{figure}%
We make the distinction between a data frame (for data storage) and a
table (for communicating with humans) because many of the operations
discussed in later lessons serve the purpose of transforming data frames
into human-facing displays such as graphics
(Lesson~\ref{sec-point_plots}) or tables (Section~\ref{sec-displaying-tables}).
Often, a literal display of a data frame may seem inefficient, for
instance, this view of the \texttt{Galton} data frame, which was
constructed from Figure~\ref{fig-galton-notebook}.
\begin{Shaded}
\begin{Highlighting}[]
\NormalTok{Galton}
\end{Highlighting}
\end{Shaded}
\begin{table}[H]
\caption{\label{tbl-galton-dataframe}The records from the table shown in
Figure~\ref{fig-galton-notebook} in a data-frame format.}
\centering{
\begin{verbatim}
| family | father | mother | sex | height | nkids |
|:------:|:------:|:------:|:---:|:------:|:-----:|
| 1 | 78.5 | 67 | M | 73.2 | 4 |
| 1 | 78.5 | 67 | F | 69.2 | 4 |
| 1 | 78.5 | 67 | F | 69 | 4 |
| 1 | 78.5 | 67 | F | 69 | 4 |
| 2 | 75.5 | 66.5 | M | 73.5 | 4 |
| 2 | 75.5 | 66.5 | M | 72.5 | 4 |
| 2 | 75.5 | 66.5 | F | 65.5 | 4 |
| 2 | 75.5 | 66.5 | F | 65.5 | 4 |
| 3 | 75 | 64 | M | 71 | 2 |
| 3 | 75 | 64 | F | 68 | 2 |
\end{verbatim}
}
\end{table}%
It may seem that the data frame is inefficient, for example repeating
the heights of mother and father for all the siblings in a family. But
this view of efficiency relates to the use of paper and ink by a table;
the computer entity requires a different view of efficiency.
\end{tcolorbox}
\newpage
\section{Data graphics}\label{sec-point_plots}
The statistical thinker seeks to identify patterns in data, such as
possible relationships between variables. Translating a data frame into
graphical form---data graphics---is an important tool for revealing or
suggesting patterns.
\begin{figure}
\centering{
\includegraphics[width=6.56in,height=\textheight]{www/playfair-aligned.png}
}
\caption{\label{fig-playfair}William Playfair's 1801 presentation of
year-by-year data on trade between England and the East Indies.
\href{https://colenda.library.upenn.edu/catalog/81431-p3bv7bb8v}{Source:
University of Pennsylvania Libraries}}
\end{figure}%
\begin{marginfigure}
\caption{\label{tbl-playfair-trade}Annual exports and imports in the
trade between England and the East Indies}
\centering{
\begin{tabular}{r|r|r}
\hline
Year & Exports & Imports\\
\hline
1700 & 180 & 460\\
\hline
1701 & 170 & 480\\
\hline
1702 & 160 & 490\\
\hline
1703 & 150 & 500\\
\hline
1704 & 145 & 510\\
\hline
1705 & 140 & 525\\
\hline
1706 & 135 & 550\\
\hline
1707 & 125 & 565\\
\hline
1708 & 120 & 580\\
\hline
\end{tabular}
\emph{\ldots{} and so on to year 1800.}
}
\end{marginfigure}%
Making pictures of data is a relatively modern idea.
\href{https://en.wikipedia.org/wiki/William_Playfair}{William Playfair}
(1759--1823) is credited as the inventor of novel graphical forms in
which data values are presented graphically rather than as numbers or
text. To illustrate, consider the data from the 1700s
(Table~\ref{tbl-playfair-trade}) that Playfair turned into a picture.
Playfair's innovation, as in Figure~\ref{fig-playfair}, was successful
because it was powerful. A pattern that may be obscure in the data frame
becomes visually apparent to the human viewer. For example, consider the
graphic in Figure~\ref{fig-playfair} displaying data on trade between
England and the East Indies in the 1700s. The graphic lets you look up
the amount of trade each year, but it also shows patterns, such as the
upward \emph{trend} across the decades.
Data graphics can also make it easy to see deviations from trends, for
instance, the dip in exports and flattening of imports during 1775--1780.
Students often encounter various types of data graphics as they progress
through elementary and high school. Figure~\ref{fig-textbook-graphs}
shows a few examples commonly found in textbooks. Remarkably, it's rare
to encounter such textbook graphic types outside of a statistics course.
\begin{figure*}
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-hist2.png}
\subcaption{\label{}Histogram}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-dotplot.png}
\subcaption{\label{}Dot plot}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-barchart.png}
\subcaption{\label{}Bar chart}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-pie.png}
\subcaption{\label{}Pie chart}
\end{minipage}%
\newline
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-boxplot.png}
\subcaption{\label{}Boxplot}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/playfair-pie-chart.png}
\subcaption{\label{}Playfair's pie chart}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/playfair-barchart.png}
\subcaption{\label{}Playfair's bar chart}
\end{minipage}%
%
\begin{minipage}{0.25\linewidth}
\includegraphics{www/thumbnail-stem-and-leaf.png}
\subcaption{\label{}Stem-and-leaf plot}
\end{minipage}%
\caption{\label{fig-textbook-graphs}Some of the graphics styles often
featured in statistics textbooks.}
\end{figure*}%
Modern data graphic designers are introducing even more variety; their
graphics can be captivating, colorful, dynamic, and informative. Some
online examples:
\href{https://flowingdata.com/2015/12/15/a-day-in-the-life-of-americans}{how
people spend their day},
\href{https://flowingdata.com/2017/01/24/one-dataset-visualized-25-ways}{life
expectancy}, \href{http://hint.fm/wind/}{wind patterns (right now!)},
\href{https://archive.nytimes.com/www.nytimes.com/imagepages/2011/11/06/opinion/06atrocities_timeline.html?action=click&contentCollection=Opinion&module=RelatedCoverage&region=EndOfArticle&pgtype=article}{historical
sources of death}. The graphical types in
Figure~\ref{fig-textbook-graphs} were all invented long before computers
became available to help us work with data.
We won't use such graphical variety in these \emph{Lessons}.
{\marginnote{\begin{footnotesize}\href{Streamlining_graphics}{Note to
instructors}\end{footnotesize}}} Instead, we will use a single basic
form of graphic---the ``\textbf{annotated point plot}''---capable of
displaying multiple variables simultaneously and which can combine into
one view both the raw data and a summary of the patterns found in the
data.
\subsection{Point plot}\label{point-plot}
A \textbf{point plot} contains a simple mark---a dot---for each row of a
data frame. In its most common form, a point plot displays two selected
variables from the data frame. One variable is depicted as the vertical
coordinate, and the other as the horizontal coordinate.
{\marginnote{\begin{footnotesize}A ``point plot'' is also known as a
``scatter plot.''\end{footnotesize}}}
To illustrate how a point plot relates to the underlying data frame,
consider Table~\ref{tbl-world-cities}, where the unit of observation is
a city. (The data frame is available in R as
\texttt{maps::world.cities}.)
\begin{table}
\caption{\label{tbl-world-cities}Basic data on cities in the
\texttt{maps::world.cities} data frame}
\centering{
\begin{tabular}{l|l|r|r|r|r}
\hline
name & country.etc & pop & lat & long & capital\\
\hline
Shanghai & China & 15017783 & 31.23 & 121.47 & 2\\
\hline
Bombay & India & 12883645 & 18.96 & 72.82 & 0\\
\hline
Karachi & Pakistan & 11969284 & 24.86 & 67.01 & 0\\
\hline
Buenos Aires & Argentina & 11595183 & -34.61 & -58.37 & 1\\
\hline
Delhi & India & 11215130 & 28.67 & 77.21 & 0\\
\hline
Manila & Philippines & 10546511 & 14.62 & 120.97 & 1\\
\hline
\end{tabular}