# Libraries
library("readxl")
library("tidyverse")
# Load data; the path is resolved relative to the working directory
cl.order <- read_xlsx("../datasets/Paquot_Larsson_2020_data.xlsx")
4.4 Continuous data
Suggested reading
Heumann et al. (2022: Chapter 3)
Preparation
We will use the dataset from the previous unit:
Measures of central tendency
From here on out, we assume \(X\) is a continuous random variable with observations \(\{x_1, x_2, ..., x_n\}\) and sample size \(n\). Measures of central tendency offer convenient one-value-summaries of the distribution of \(X\).
The sample mean
The sample mean \(\bar{x}\) is defined as
\[ \bar{x} = \frac{x_1 + x_2 + ... + x_n}{n} \\ = \frac{1}{n}\sum_{i=1}^n{x_i}. \tag{1}\]
In R, we can obtain the average value of a numeric vector with the mean()
function. Let’s do that for the length of main clauses found in the cl.order
data:
# Arithmetic mean of main-clause length
with(cl.order, mean(LEN_MC))
[1] 9.265509
The output returned by this function provides a one-value summary of all observations contained in LEN_MC
. Because the mean \(\bar{x}\) takes into account all data points, it is prone to the influence of outliers, i.e., extreme values.
The distribution of continuous variables is best visualised in terms of histograms or density plots, which are illustrated for LEN_MC
. The blue line indicates the sample mean.
# Plot distribution of LEN_MC as a histogram (bins of width 2)
cl.length.hist <- ggplot(cl.order, aes(x = LEN_MC)) +
  geom_histogram(binwidth = 2)

cl.length.hist +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  theme_classic()
# Plot distribution of LEN_MC as a kernel density curve
cl.length.dens <- ggplot(cl.order, aes(x = LEN_MC)) +
  geom_density()

cl.length.dens +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  theme_classic()
# Base R histogram of main-clause length
hist(cl.order$LEN_MC)
# Mark the sample mean with a thick blue vertical line
abline(v = mean(cl.order$LEN_MC), col = "steelblue", lwd = 3)
# Base R kernel density estimate of main-clause length
plot(density(cl.order$LEN_MC))
# Mark the sample mean with a thick blue vertical line
abline(v = mean(cl.order$LEN_MC), col = "steelblue", lwd = 3)
The median
The median()
function computes “the halfway point of the data (50% of the data are above the median; 50% of the data are below)” (Winter 2020: 58). As such, it is the measure of choice for data with many outliers as well as for ordinal data (e.g. Likert-scale ratings).
\[ \tilde{x}_{0.5} = \begin{cases} x_{((n+1)/2)} & \text{if } n \text{ is odd.} \\ \frac{1}{2}(x_{(n/2)}+x_{(n/2+1)}) & \text{if } n \text{ is even.} \end{cases} \tag{2}\]
# Median (50% quantile) of main-clause length
with(cl.order, median(LEN_MC))
[1] 8
The median of LEN_MC
is represented by the red vertical line.
cl.length.hist +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  # Add median
  geom_vline(aes(xintercept = median(LEN_MC)),
             color = "red",
             linewidth = 1) +
  theme_classic()
cl.length.dens +
  # Add mean
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  # Add median
  geom_vline(aes(xintercept = median(LEN_MC)),
             color = "red",
             linewidth = 1) +
  theme_classic()
# Base R histogram of main-clause length
hist(cl.order$LEN_MC)
# Sample mean in blue
abline(v = mean(cl.order$LEN_MC), col = "steelblue", lwd = 3)
# Sample median in red
abline(v = median(cl.order$LEN_MC), col = "red", lwd = 3)
# Base R kernel density estimate of main-clause length
plot(density(cl.order$LEN_MC))
# Sample mean in blue
abline(v = mean(cl.order$LEN_MC), lwd = 3, col = "steelblue")
# Sample median in red (previously drawn at the mean, hiding the blue line)
abline(v = median(cl.order$LEN_MC), lwd = 3, col = "red")
Sample variance and standard deviation
In order to assess how well the mean represents the data, it is instructive to compute the variance var()
and the standard deviation sd()
for a sample.
The unbiased sample variance \(s^2\) is defined as
\[ s^2 = \frac{1}{n-1}\sum_{i=1}^n{(x_i - \bar{x})^2}. \tag{3}\]
In other words, it represents the average squared deviation of all observations from the sample mean.
# Unbiased sample variance of main-clause length
with(cl.order, var(LEN_MC))
[1] 25.12585
Correspondingly, the sample standard deviation is the square root of the variance:
\[ s = \sqrt{s^2} \tag{4}\]
# Sample standard deviation of main-clause length
with(cl.order, sd(LEN_MC))
[1] 5.012569
cl.length.hist +
  # Add vertical line for the mean
  geom_vline(aes(xintercept = mean(LEN_MC)),
             color = "steelblue",
             linewidth = 1) +
  # Add -1sd
  geom_vline(aes(xintercept = mean(LEN_MC) - sd(LEN_MC)),
             color = "orange",
             linewidth = 1) +
  # Add +1sd
  geom_vline(aes(xintercept = mean(LEN_MC) + sd(LEN_MC)),
             color = "orange",
             linewidth = 1) +
  theme_classic()
# Create data frame with mean and sd for each clause ORDER
cl_mean_sd <- cl.order %>%
  # Select variables of interest
  select(ORDER, LEN_MC) %>%
  # Group results of following operations by ORDER
  group_by(ORDER) %>%
  # Create grouped summary of mean and sd for each ORDER
  summarise(mean = mean(LEN_MC),
            sd = sd(LEN_MC))

# Print the summary table
cl_mean_sd
# A tibble: 2 × 3
ORDER mean sd
<chr> <dbl> <dbl>
1 mc-sc 9.04 4.91
2 sc-mc 9.75 5.22
# Plot results: one bar per clause order with error bars of +/- 1 sd
ggplot(cl_mean_sd, aes(x = ORDER, y = mean)) +
  # Bar height is the group mean
  geom_col() +
  # Whiskers span one standard deviation either side of the mean;
  # the x aesthetic is inherited from ggplot()
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.2) +
  theme_classic() +
  labs(x = "Clause order", y = "Mean length of main clauses")
Quantiles
While median()
divides the data into two equal sets (i.e., two 50% quantiles), the quantile()
function makes it possible to partition the data further.
# Default quantiles: minimum, quartiles and maximum
with(cl.order, quantile(LEN_MC))
0% 25% 50% 75% 100%
2 6 8 11 31
quantile(x, 0)
and quantile(x, 1)
thus show the minimum and maximum values, respectively.
# 0% quantile, i.e. the minimum
quantile(cl.order$LEN_MC, probs = 0)
0%
2
# 100% quantile, i.e. the maximum
quantile(cl.order$LEN_MC, probs = 1)
100%
31
Quartiles and boxplots
Consider the distribution of clause length by clause order:
# Base R boxplot of main-clause length, one box per clause order
boxplot(LEN_MC ~ ORDER, data = cl.order)
# ggplot2 boxplot of main-clause length, one box per clause order
ggplot(data = cl.order, mapping = aes(x = ORDER, y = LEN_MC)) +
  geom_boxplot() +
  theme_classic()
Compare it to the corresponding rotated density plot:
# Overlapping densities per clause order, drawn semi-transparent
ggplot(data = cl.order, mapping = aes(x = LEN_MC, fill = ORDER)) +
  geom_density(alpha = 0.5) +
  # Swap the axes so LEN_MC runs vertically, as in the boxplot
  coord_flip() +
  theme_classic()
Visualising mixed data
A numerical and categorical variable
- Boxplot with
geom_boxplot()
# Numeric variable (y) split by a categorical variable (x)
ggplot(data = cl.order, mapping = aes(x = ORDER, y = LEN_MC)) +
  geom_boxplot()
- Density plot using the optional arguments
color
and/or fill
# One semi-transparent density curve per clause order
ggplot(data = cl.order, mapping = aes(x = LEN_MC, fill = ORDER)) +
  geom_density(alpha = 0.5)
- A barplot with
geom_col()
# geom_col() inherits x and y from ggplot(), so they need not be repeated.
# Note: with one row per observation the values are stacked, so each bar
# shows the summed LEN_MC per ORDER rather than a mean.
ggplot(cl.order, aes(x = ORDER, y = LEN_MC)) +
  geom_col()
Multivariate plots
- Advanced scatterplot with four variables:
LEN_MC
(x), LEN_SC
(y), ORDER
(colour) and SUBORDTYPE
(shape)
# 4 variables: two clause lengths on the axes, ORDER as colour and
# SUBORDTYPE as point shape
ggplot(data = cl.order, mapping = aes(x = LEN_MC, y = LEN_SC)) +
  geom_point(mapping = aes(color = ORDER, shape = SUBORDTYPE))
- Facets
# 5 variables: same scatterplot, with one panel per level of MORETHAN2CL
ggplot(data = cl.order, mapping = aes(x = LEN_MC, y = LEN_SC)) +
  geom_point(mapping = aes(color = ORDER, shape = SUBORDTYPE)) +
  facet_wrap(~MORETHAN2CL)