# Libraries
library("readxl")
library("tidyverse")
# Load data from working directory
# read_xlsx() comes from the readxl package; the result is stored as
# `cl.order` and used by all later examples.
cl.order <- read_xlsx("Paquot_Larsson_2020_data.xlsx")
16 Continuous data
16.1 Suggested reading
Heumann et al. (2022: Chapter 3)
16.2 Preparation
We will use the dataset from the previous unit:
16.3 Measures of central tendency
16.3.1 The mean
A useful summary statistic is the arithmetic mean \(\bar{x}\). Consider a continuous variable \(X\) with observations \(\{x_1, x_2, ..., x_n\}\) from a sample of size \(n\). The sample mean \(\bar{x}\) then corresponds to
\[ \bar{x} = \frac{x_1 + x_2 + ... + x_n}{n} \\ = \frac{1}{n}\sum_{i=1}^n{x_i}. \tag{16.1}\]
The summation symbol \(\sum\) (Greek ‘upper-case sigma’) is a concise way to express the sum of a sequence of numbers.
It works like this:
The expression below the \(\sum\) (e.g., \(i = 1\)) indicates the starting value of the index \(i\).
The number above the \(\sum\) (e.g., \(n\)) is the ending value of the index \(i\).
The expression to the right of the \(\sum\) (e.g., \(x_i\)) is the term to be summed.
For instance, the more general expression \(\sum_{i=1}^n{x_i}\) means “sum the values of \(x_i\) starting from \(i = 1\) up to \(i = n\).” In other words, it adds up the values \(x_1 + x_2 + \dots + x_n\).
In R, we can obtain the average value of a numeric vector with the mean()
function. Let’s do that for the length of main clauses found in the cl.order
data:
# Arithmetic mean of the LEN_MC column
mean(cl.order[["LEN_MC"]])
[1] 9.265509
The output returned by this function provides a one-value summary of all observations contained in LEN_MC. Because the mean \(\bar{x}\) takes into account all data points, it is prone to the influence of outliers, i.e., extreme values.
The distribution of continuous variables is best visualised in terms of histograms or density plots, which are illustrated for LEN_MC
. The blue line indicates the sample mean.
# Plot distribution of LEN_MC
# The base histogram is stored so that later examples can add layers to it.
cl.length.hist <- ggplot(cl.order, aes(x = LEN_MC)) +
  geom_histogram(binwidth = 2)

# Add mean (vertical steelblue line)
cl.length.hist +
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  theme_classic()
# Plot distribution of LEN_MC
# The base density plot is stored so that later examples can add layers to it.
cl.length.dens <- ggplot(cl.order, aes(x = LEN_MC)) +
  geom_density()

# Add mean (vertical steelblue line)
cl.length.dens +
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  theme_classic()
# Base R histogram of LEN_MC with the mean marked in steelblue
hist(cl.order$LEN_MC)
abline(v = mean(cl.order$LEN_MC), col = "steelblue", lwd = 3)

# Base R density plot of LEN_MC with the mean marked in steelblue
plot(density(cl.order$LEN_MC))
abline(v = mean(cl.order$LEN_MC), col = "steelblue", lwd = 3)
16.3.2 The median
The median() function computes “the halfway point of the data (50% of the data are above the median; 50% of the data are below)” (Winter 2020: 58). As such, it is the measure of choice for data with many outliers as well as for ordinal data (e.g. Likert-scale ratings).
\[ \tilde{x}_{0.5} = \begin{cases} x_{((n+1)/2)} & \text{if } n \text{ is odd.} \\ \frac{1}{2}(x_{(n/2)}+x_{(n/2+1)}) & \text{if } n \text{ is even.} \end{cases} \tag{16.2}\]
# Median of the LEN_MC column
median(cl.order[["LEN_MC"]])
[1] 8
The median of LEN_MC
is represented by the red vertical line.
# Histogram of LEN_MC with mean (steelblue) and median (red)
cl.length.hist +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)), color = "steelblue", linewidth = 1) +
  # Add median
  geom_vline(aes(xintercept = median(LEN_MC)), color = "red", linewidth = 1) +
  theme_classic()
# Density plot of LEN_MC with mean (steelblue) and median (red)
cl.length.dens +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)), color = "steelblue", linewidth = 1) +
  # Add median
  geom_vline(aes(xintercept = median(LEN_MC)), color = "red", linewidth = 1) +
  theme_classic()
# Base R histogram of LEN_MC with mean (steelblue) and median (red)
hist(cl.order$LEN_MC)
abline(v = mean(cl.order$LEN_MC), lwd = 3, col = "steelblue")
abline(v = median(cl.order$LEN_MC), lwd = 3, col = "red")

# Base R density plot of LEN_MC with mean (steelblue) and median (red)
plot(density(cl.order$LEN_MC))
abline(v = mean(cl.order$LEN_MC), lwd = 3, col = "steelblue")
# The red line marks the median; it previously repeated mean() and so
# overplotted the mean line.
abline(v = median(cl.order$LEN_MC), lwd = 3, col = "red")
16.3.3 Sample variance and standard deviation
In order to assess how well the mean represents the data, it is instructive to compute the variance var()
and the standard deviation sd()
for a sample. The component \(\sum_{i=1}^n (x_i - \bar{x})^2\) is also known as the squared error.
The sample variance \(s^2\) is defined as
\[ s^2 = \frac{1}{n}\sum_{i=1}^n{(x_i - \bar{x})^2}. \tag{16.3}\]
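In other words, it stands for the average squared deviation of all observations from the sample mean. Note that R's var() and sd() functions divide by \(n - 1\) rather than \(n\), so their output differs slightly from the values obtained with the formulas given here, especially for small samples.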
# Variance of the LEN_MC column
var(cl.order[["LEN_MC"]])
[1] 25.12585
Correspondingly, the standard deviation is the square root of the variance:
\[ s = \sqrt{\frac{1}{n}\sum_{i=1}^n{(x_i - \bar{x})^2}} \tag{16.4}\]
# Standard deviation of the LEN_MC column
sd(cl.order[["LEN_MC"]])
[1] 5.012569
Visualisation:
# Histogram of LEN_MC with the mean (steelblue) and mean +/- 1 sd (orange)
cl.length.hist +
  # Add vertical line for the mean
  geom_vline(aes(xintercept = mean(LEN_MC)), color = "steelblue", linewidth = 1) +
  # Add -1sd
  geom_vline(aes(xintercept = mean(LEN_MC) - sd(LEN_MC)), color = "orange", linewidth = 1) +
  # Add +1sd
  geom_vline(aes(xintercept = mean(LEN_MC) + sd(LEN_MC)), color = "orange", linewidth = 1) +
  theme_classic()
# Create data frame with mean and sd for each clause ORDER
cl_mean_sd <- cl.order %>%
  # Select variables of interest
  select(ORDER, LEN_MC) %>%
  # Group results of following operations by ORDER
  group_by(ORDER) %>%
  # Create grouped summary of mean and sd for each ORDER
  summarise(mean = mean(LEN_MC),
            sd = sd(LEN_MC))

# Print the summary table
cl_mean_sd
# A tibble: 2 × 3
ORDER mean sd
<chr> <dbl> <dbl>
1 mc-sc 9.04 4.91
2 sc-mc 9.75 5.22
# Plot results: one bar per clause order, with error bars of +/- 1 sd
ggplot(data = cl_mean_sd, mapping = aes(x = ORDER, y = mean)) +
  # Bar height is taken directly from the `mean` column
  geom_col() +
  # Error bars span mean - sd to mean + sd (x is inherited from ggplot())
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
  labs(x = "Clause order", y = "Mean length of main clauses") +
  theme_classic()
16.3.4 Quantiles
While median()
divides the data into two equal sets (i.e., two 50% quantiles), the quantile()
function makes it possible to partition the data further.
# Default quantiles (0%, 25%, 50%, 75%, 100%) of LEN_MC
quantile(cl.order[["LEN_MC"]])
0% 25% 50% 75% 100%
2 6 8 11 31
quantile(x, 0)
and quantile(x, 1)
thus show the minimum and maximum values, respectively.
# 0% quantile of LEN_MC
quantile(cl.order$LEN_MC, probs = 0)
0%
2
# 100% quantile of LEN_MC
quantile(cl.order$LEN_MC, probs = 1)
100%
31
16.3.5 Quartiles and boxplots
Consider the distribution of clause length by clause order:
# Base R boxplot of LEN_MC for each level of ORDER
boxplot(LEN_MC ~ ORDER, data = cl.order)

# The same boxplot with ggplot2
ggplot(data = cl.order, mapping = aes(x = ORDER, y = LEN_MC)) +
  geom_boxplot() +
  theme_classic()
Compare it to the corresponding rotated density plot:
# Overlaid, semi-transparent densities of LEN_MC per ORDER, with axes flipped
ggplot(data = cl.order, mapping = aes(x = LEN_MC, fill = ORDER)) +
  geom_density(alpha = 0.5) +
  theme_classic() +
  coord_flip()
16.4 Visualising mixed data
16.4.1 A numerical and categorical variable
- Boxplot with
geom_boxplot()
# Boxplot of LEN_MC for each level of ORDER
ggplot(data = cl.order, mapping = aes(x = ORDER, y = LEN_MC)) +
  geom_boxplot()
- Density plot using the optional arguments
color
and/or fill
# Semi-transparent density curves of LEN_MC, filled by ORDER
ggplot(data = cl.order, mapping = aes(x = LEN_MC, fill = ORDER)) +
  geom_density(alpha = 0.5)
- A barplot with
geom_col()
# Bar plot of LEN_MC by ORDER; geom_col() inherits x and y from ggplot()
ggplot(data = cl.order, mapping = aes(x = ORDER, y = LEN_MC)) +
  geom_col()
16.4.2 Multivariate plots
- Advanced scatterplot with four variables:
LEN_MC
(x), LEN_SC
(y), ORDER
(colour) and SUBORDTYPE
(shape)
# 4 variables: x, y, point colour and point shape
ggplot(
  data = cl.order,
  mapping = aes(x = LEN_MC, y = LEN_SC, color = ORDER, shape = SUBORDTYPE)
) +
  geom_point()
- Facets
# 5 variables: x, y, colour, shape, plus one panel per MORETHAN2CL value
cl.order %>%
  ggplot(aes(x = LEN_MC, y = LEN_SC)) +
  geom_point(aes(color = ORDER, shape = SUBORDTYPE)) +
  facet_wrap(~ MORETHAN2CL)