Introduction

This project analyzes the ‘Gapminder’ dataset in R to explore global trends in life expectancy and GDP per capita.

Data Preparation

Load the libraries and data

library(gapminder)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(rmarkdown)

# Bind the gapminder dataset to a short working name used by every later step.
# Note: this name shadows utils::data() for the rest of the session.
data <- gapminder

Data Summary

# Per-column overview: level counts for factor columns, min/quartiles/mean/max
# for numeric columns.
summary(data)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

Check for missing and duplicate values

# Total number of missing cells across every column.
sum(is.na(data))
## [1] 0
# A row is a duplicate when its (country, year) key occurs more than once.
# count() tallies each key and returns an ungrouped tibble, so no leftover
# grouping (and no `.groups` message) leaks out of this check.
duplicates <- data %>%
  count(country, year) %>%
  filter(n > 1)
nrow(duplicates)
## [1] 0

Data structure

# Show the dimensions and the type of each column.
str(data)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

Year range

# Earliest and latest year present in the dataset.
year_range <- range(data[["year"]])
print(year_range)
## [1] 1952 2007

Data Analysis

Worldwide metrics by year

# Worldwide mean, standard deviation and median of each numeric measure,
# computed separately for every year. Output columns are named
# <statistic>_<variable> (e.g. mean_lifeExp), variable by variable.
data_stats <- data %>%
  group_by(year) %>%
  summarise(
    across(
      c(lifeExp, pop, gdpPercap),
      list(mean = mean, sd = sd, median = median),
      .names = "{.fn}_{.col}"
    )
  )
print(data_stats)
## # A tibble: 12 × 10
##     year mean_lifeExp sd_lifeExp median_lifeExp  mean_pop     sd_pop median_pop
##    <int>        <dbl>      <dbl>          <dbl>     <dbl>      <dbl>      <dbl>
##  1  1952         49.1       12.2           45.1 16950402.  58100863.   3943953 
##  2  1957         51.5       12.2           48.4 18763413.  65504285.   4282942 
##  3  1962         53.6       12.1           50.9 20421007.  69788650.   4686040.
##  4  1967         55.7       11.7           53.8 22658298.  78375481.   5170176.
##  5  1972         57.6       11.4           56.5 25189980.  88646817.   5877996.
##  6  1977         59.6       11.2           59.7 27676379.  97481091.   6404036.
##  7  1982         61.5       10.8           62.4 30207302. 105098650.   7007320 
##  8  1987         63.2       10.6           65.8 33038573. 114756180.   7774862.
##  9  1992         64.2       11.2           67.7 35990917. 124502589.   8688686.
## 10  1997         65.0       11.6           69.4 38839468. 133417391.   9735064.
## 11  2002         65.7       12.3           70.8 41457589. 140848283.  10372918.
## 12  2007         67.0       12.1           71.9 44021220. 147621398.  10517531 
## # ℹ 3 more variables: mean_gdpPercap <dbl>, sd_gdpPercap <dbl>,
## #   median_gdpPercap <dbl>

Quantiles for GDP per Capita and Life Expectancy

# First and third quartiles of GDP per capita and life expectancy,
# pooled over all countries and years (one-row result).
quantiles <- data %>%
  summarise(
    across(
      c(gdpPercap, lifeExp),
      list(
        q1 = function(x) quantile(x, probs = 0.25),
        q3 = function(x) quantile(x, probs = 0.75)
      ),
      .names = "{.fn}_{.col}"
    )
  )
print(quantiles)
## # A tibble: 1 × 4
##   q1_gdpPercap q3_gdpPercap q1_lifeExp q3_lifeExp
##          <dbl>        <dbl>      <dbl>      <dbl>
## 1        1202.        9325.       48.2       70.8

Life expectancy changes across continents

# Mean life expectancy for every continent in every year.
# `.groups = "drop_last"` spells out the grouping summarise() would otherwise
# choose implicitly (the result stays grouped by continent) and silences the
# informational message that the implicit default triggers.
continent_lifeExp <- data %>%
  group_by(continent, year) %>%
  summarise(mean_lifeExp = mean(lifeExp), .groups = "drop_last")
continent_lifeExp
## # A tibble: 60 × 3
## # Groups:   continent [5]
##    continent  year mean_lifeExp
##    <fct>     <int>        <dbl>
##  1 Africa     1952         39.1
##  2 Africa     1957         41.3
##  3 Africa     1962         43.3
##  4 Africa     1967         45.3
##  5 Africa     1972         47.5
##  6 Africa     1977         49.6
##  7 Africa     1982         51.6
##  8 Africa     1987         53.3
##  9 Africa     1992         53.6
## 10 Africa     1997         53.6
## # ℹ 50 more rows

GDP per Capita changes across continents

# Mean GDP per capita for every continent in every year.
# `.groups = "drop_last"` spells out the grouping summarise() would otherwise
# choose implicitly (the result stays grouped by continent) and silences the
# informational message that the implicit default triggers.
continent_gdpPercap <- data %>%
  group_by(continent, year) %>%
  summarise(mean_gdpPercap = mean(gdpPercap), .groups = "drop_last")
continent_gdpPercap
## # A tibble: 60 × 3
## # Groups:   continent [5]
##    continent  year mean_gdpPercap
##    <fct>     <int>          <dbl>
##  1 Africa     1952          1253.
##  2 Africa     1957          1385.
##  3 Africa     1962          1598.
##  4 Africa     1967          2050.
##  5 Africa     1972          2340.
##  6 Africa     1977          2586.
##  7 Africa     1982          2482.
##  8 Africa     1987          2283.
##  9 Africa     1992          2282.
## 10 Africa     1997          2379.
## # ℹ 50 more rows

Highest Life Expectancy and GDP per Capita by year

# For each year, keep the row(s) whose life expectancy equals that year's
# maximum. The comparison is against max() of the same column, so `==` is
# exact here; any ties are all retained.
highest_lifeExp <- data %>%
  group_by(year) %>%
  filter(lifeExp == max(lifeExp)) %>%
  select(year, country, continent, gdpPercap, lifeExp) %>%
  arrange(year)

highest_lifeExp
## # A tibble: 12 × 5
## # Groups:   year [12]
##     year country continent gdpPercap lifeExp
##    <int> <fct>   <fct>         <dbl>   <dbl>
##  1  1952 Norway  Europe       10095.    72.7
##  2  1957 Iceland Europe        9244.    73.5
##  3  1962 Iceland Europe       10350.    73.7
##  4  1967 Sweden  Europe       15258.    74.2
##  5  1972 Sweden  Europe       17832.    74.7
##  6  1977 Iceland Europe       19655.    76.1
##  7  1982 Japan   Asia         19384.    77.1
##  8  1987 Japan   Asia         22376.    78.7
##  9  1992 Japan   Asia         26825.    79.4
## 10  1997 Japan   Asia         28817.    80.7
## 11  2002 Japan   Asia         28605.    82  
## 12  2007 Japan   Asia         31656.    82.6
# For each year, keep the row(s) whose GDP per capita equals that year's
# maximum (same exact-match reasoning as above).
highest_gdpPercap <- data %>%
  group_by(year) %>%
  filter(gdpPercap == max(gdpPercap)) %>%
  select(year, country, continent, gdpPercap, lifeExp) %>%
  arrange(year)

highest_gdpPercap
## # A tibble: 12 × 5
## # Groups:   year [12]
##     year country      continent gdpPercap lifeExp
##    <int> <fct>        <fct>         <dbl>   <dbl>
##  1  1952 Kuwait       Asia        108382.    55.6
##  2  1957 Kuwait       Asia        113523.    58.0
##  3  1962 Kuwait       Asia         95458.    60.5
##  4  1967 Kuwait       Asia         80895.    64.6
##  5  1972 Kuwait       Asia        109348.    67.7
##  6  1977 Kuwait       Asia         59265.    69.3
##  7  1982 Saudi Arabia Asia         33693.    63.0
##  8  1987 Norway       Europe       31541.    75.9
##  9  1992 Kuwait       Asia         34933.    75.2
## 10  1997 Norway       Europe       41283.    78.3
## 11  2002 Norway       Europe       44684.    79.0
## 12  2007 Norway       Europe       49357.    80.2

Correlation between GDP per Capita and Life Expectancy

# Pearson correlation between life expectancy and GDP per capita over all rows.
correlation <- cor(data[["lifeExp"]], data[["gdpPercap"]], method = "pearson")
cat(
  "Pearson correlation coefficient between life expectancy and GDP per Capita is",
  correlation
)
## Pearson correlation coefficient between life expectancy and GDP per Capita is 0.5837062

Statistical Testing

ANOVA Test for GDP per Capita across continents

# One-way ANOVA of GDP per capita with continent as the only factor.
anova_gdpPercap <- aov(gdpPercap ~ continent, data = data)
summary_result <- summary(anova_gdpPercap)
# The first table of the summary holds the ANOVA rows; its first Pr(>F)
# entry belongs to the continent term.
p_value <- summary_result[[1]][["Pr(>F)"]][1]

cat("The p-value for the ANOVA test is", p_value, "\n")
## The p-value for the ANOVA test is 1.172126e-94
# Judge significance at the 5% level.
if (p_value >= 0.05) {
  cat("The differences in GDP per capita across continents are not statistically significant.")
} else {
  cat("The differences in GDP per capita across continents are statistically significant.")
}
## The differences in GDP per capita across continents are statistically significant.

T-test Comparing Asia & Europe GDP per Capita

# GDP per capita observations for the two continents being compared.
asia_gdp <- pull(filter(data, continent == "Asia"), gdpPercap)
europe_gdp <- pull(filter(data, continent == "Europe"), gdpPercap)

# Two-sample t-test with default settings (reported below as a Welch test).
t_test_result <- t.test(x = asia_gdp, y = europe_gdp)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  asia_gdp and europe_gdp
## t = -7.6278, df = 693.01, p-value = 7.904e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8257.753 -4876.897
## sample estimates:
## mean of x mean of y 
##   7902.15  14469.48
# Interpretation: report the t-test p-value and judge it at the 5% level.
p_value <- t_test_result[["p.value"]]
cat(
  "The p-value for the t-test comparing GDP per capita between Asia and Europe is",
  p_value,
  "\n"
)
## The p-value for the t-test comparing GDP per capita between Asia and Europe is 7.903913e-14
if (p_value >= 0.05) {
  cat("The difference in GDP per capita between Asia and Europe is not statistically significant.")
} else {
  cat("The difference in GDP per capita between Asia and Europe is statistically significant.")
}
## The difference in GDP per capita between Asia and Europe is statistically significant.

Trend Analysis

Life expectancy over time

# One line per continent: the mean life expectancy of its countries per year.
ggplot(data = data, mapping = aes(year, lifeExp, colour = continent)) +
  stat_summary(fun = mean, geom = "line") +
  labs(
    title = "Life Expectancy Over Time by Continent",
    x = "Year",
    y = "Life Expectancy"
  )

GDP per Capita over time

# One line per continent: the mean GDP per capita of its countries per year.
ggplot(data = data, mapping = aes(year, gdpPercap, colour = continent)) +
  stat_summary(fun = mean, geom = "line") +
  labs(
    title = "GDP per Capita Over Time",
    x = "Year",
    y = "GDP per Capita"
  )

Correlation Matrix

# Pairwise correlations among the three numeric measures.
numeric_data <- select(data, lifeExp, gdpPercap, pop)
correlation_matrix <- cor(x = numeric_data)
print(correlation_matrix)
##              lifeExp   gdpPercap         pop
## lifeExp   1.00000000  0.58370622  0.06495537
## gdpPercap 0.58370622  1.00000000 -0.02559958
## pop       0.06495537 -0.02559958  1.00000000
# Draw the matrix as a heatmap with the coefficients printed in each cell.
ggcorrplot(corr = correlation_matrix, lab = TRUE)

Data Visualizations

Boxplot of GDP per Capita by Continent

# Distribution of GDP per capita within each continent (all years pooled).
ggplot(data = data, mapping = aes(continent, gdpPercap, fill = continent)) +
  geom_boxplot() +
  labs(
    title = "GDP per Capita by Continent",
    x = "Continent",
    y = "GDP per Capita"
  )

Boxplot of Life Expectancy by Continent

# Distribution of life expectancy within each continent (all years pooled).
ggplot(data = data, mapping = aes(continent, lifeExp, fill = continent)) +
  geom_boxplot() +
  labs(
    title = "Life Expectancy by Continent",
    x = "Continent",
    y = "Life Expectancy"
  )

Scatterplot of GDP per Capita and Life Expectancy

# Each point is one country-year observation, coloured by continent.
ggplot(data = data, mapping = aes(gdpPercap, lifeExp, colour = continent)) +
  geom_point() +
  labs(
    title = "Life Expectancy vs GDP per Capita",
    x = "GDP per Capita",
    y = "Life Expectancy"
  )

Filtering & removing outliers

# Quartiles and interquartile range of GDP per capita, pooled over all rows.
gdp_q1 <- quantile(data[["gdpPercap"]], probs = 0.25)
gdp_q3 <- quantile(data[["gdpPercap"]], probs = 0.75)
gdp_iqr <- IQR(data[["gdpPercap"]])

# Same statistics for life expectancy.
lifeExp_q1 <- quantile(data[["lifeExp"]], probs = 0.25)
lifeExp_q3 <- quantile(data[["lifeExp"]], probs = 0.75)
lifeExp_iqr <- IQR(data[["lifeExp"]])

# Keep only rows that lie strictly inside the 1.5 * IQR fences for BOTH
# variables; filter() combines comma-separated conditions with a logical AND.
filtered_data <- filter(
  data,
  gdpPercap > gdp_q1 - 1.5 * gdp_iqr,
  gdpPercap < gdp_q3 + 1.5 * gdp_iqr,
  lifeExp > lifeExp_q1 - 1.5 * lifeExp_iqr,
  lifeExp < lifeExp_q3 + 1.5 * lifeExp_iqr
)

Boxplot of GDP per Capita by Continent without outliers

# GDP per capita by continent, drawn from the outlier-filtered rows.
ggplot(
  data = filtered_data,
  mapping = aes(continent, gdpPercap, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "GDP per Capita by Continent (Without Outliers)",
    x = "Continent",
    y = "GDP per Capita"
  )

Boxplot of Life Expectancy by Continent without outliers

# Life expectancy by continent, drawn from the outlier-filtered rows.
ggplot(
  data = filtered_data,
  mapping = aes(continent, lifeExp, fill = continent)
) +
  geom_boxplot() +
  labs(
    title = "Life Expectancy by Continent (Without Outliers)",
    x = "Continent",
    y = "Life Expectancy"
  )

Scatterplot of GDP per Capita and Life Expectancy without outliers

# Life expectancy against GDP per capita for the outlier-filtered rows,
# coloured by continent.
ggplot(
  data = filtered_data,
  mapping = aes(gdpPercap, lifeExp, colour = continent)
) +
  geom_point() +
  labs(
    title = "Life Expectancy vs GDP per Capita (Without Outliers)",
    x = "GDP per Capita",
    y = "Life Expectancy"
  )