This project analyzes the ‘Gapminder’ dataset in R to explore global trends in life expectancy and GDP per capita.
library(gapminder)
library(dplyr)
library(ggplot2)
library(ggcorrplot)
library(rmarkdown)
# Bind the packaged gapminder table to a working name, then print per-column
# summaries (level counts for the factors, min/quartiles/mean/max for the
# numeric columns).
data <- gapminder::gapminder
data %>% summary()
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# Count missing cells across the whole table.
data %>%
  is.na() %>%
  sum()
## [1] 0
# Check that (country, year) uniquely identifies a row: tally the rows per
# key and keep only keys that occur more than once.
# `.groups = "drop"` returns an ungrouped tibble, so no "grouped output"
# message is emitted and nothing downstream inherits a stray grouping.
duplicates <- data %>%
  group_by(country, year) %>%
  summarise(n = n(), .groups = "drop") %>%
  filter(n > 1)

# Zero rows means there are no duplicated country-year observations.
nrow(duplicates)
## [1] 0
# Show the column types and the first few values of each column.
data %>% str()
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
# First and last survey years present in the data, as c(earliest, latest).
year_range <- c(min(data$year), max(data$year))
print(year_range)
## [1] 1952 2007
# Per-year descriptive statistics: mean, standard deviation and median of
# life expectancy, population and GDP per capita.
# The `.names` glue spec produces columns named <stat>_<variable>, emitted
# variable by variable (mean_lifeExp, sd_lifeExp, median_lifeExp, mean_pop, ...).
data_stats <- data %>%
  group_by(year) %>%
  summarise(
    across(
      c(lifeExp, pop, gdpPercap),
      list(mean = mean, sd = sd, median = median),
      .names = "{.fn}_{.col}"
    )
  )
print(data_stats)
## # A tibble: 12 × 10
## year mean_lifeExp sd_lifeExp median_lifeExp mean_pop sd_pop median_pop
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1952 49.1 12.2 45.1 16950402. 58100863. 3943953
## 2 1957 51.5 12.2 48.4 18763413. 65504285. 4282942
## 3 1962 53.6 12.1 50.9 20421007. 69788650. 4686040.
## 4 1967 55.7 11.7 53.8 22658298. 78375481. 5170176.
## 5 1972 57.6 11.4 56.5 25189980. 88646817. 5877996.
## 6 1977 59.6 11.2 59.7 27676379. 97481091. 6404036.
## 7 1982 61.5 10.8 62.4 30207302. 105098650. 7007320
## 8 1987 63.2 10.6 65.8 33038573. 114756180. 7774862.
## 9 1992 64.2 11.2 67.7 35990917. 124502589. 8688686.
## 10 1997 65.0 11.6 69.4 38839468. 133417391. 9735064.
## 11 2002 65.7 12.3 70.8 41457589. 140848283. 10372918.
## 12 2007 67.0 12.1 71.9 44021220. 147621398. 10517531
## # ℹ 3 more variables: mean_gdpPercap <dbl>, sd_gdpPercap <dbl>,
## # median_gdpPercap <dbl>
# First and third quartiles of GDP per capita and life expectancy, computed
# over every row of the table at once (no grouping).
quantiles <- summarise(
  data,
  q1_gdpPercap = quantile(gdpPercap, probs = 0.25),
  q3_gdpPercap = quantile(gdpPercap, probs = 0.75),
  q1_lifeExp = quantile(lifeExp, probs = 0.25),
  q3_lifeExp = quantile(lifeExp, probs = 0.75)
)
print(quantiles)
## # A tibble: 1 × 4
## q1_gdpPercap q3_gdpPercap q1_lifeExp q3_lifeExp
## <dbl> <dbl> <dbl> <dbl>
## 1 1202. 9325. 48.2 70.8
# Mean life expectancy for each continent in each survey year.
# `.groups = "drop_last"` spells out dplyr's default for this call: the result
# stays grouped by continent, exactly as before, but the "grouped output"
# message is no longer emitted.
continent_lifeExp <- data %>%
  group_by(continent, year) %>%
  summarise(mean_lifeExp = mean(lifeExp), .groups = "drop_last")
continent_lifeExp
## # A tibble: 60 × 3
## # Groups: continent [5]
## continent year mean_lifeExp
## <fct> <int> <dbl>
## 1 Africa 1952 39.1
## 2 Africa 1957 41.3
## 3 Africa 1962 43.3
## 4 Africa 1967 45.3
## 5 Africa 1972 47.5
## 6 Africa 1977 49.6
## 7 Africa 1982 51.6
## 8 Africa 1987 53.3
## 9 Africa 1992 53.6
## 10 Africa 1997 53.6
## # ℹ 50 more rows
# Mean GDP per capita for each continent in each survey year.
# `.groups = "drop_last"` spells out dplyr's default for this call: the result
# stays grouped by continent, exactly as before, but the "grouped output"
# message is no longer emitted.
continent_gdpPercap <- data %>%
  group_by(continent, year) %>%
  summarise(mean_gdpPercap = mean(gdpPercap), .groups = "drop_last")
continent_gdpPercap
## # A tibble: 60 × 3
## # Groups: continent [5]
## continent year mean_gdpPercap
## <fct> <int> <dbl>
## 1 Africa 1952 1253.
## 2 Africa 1957 1385.
## 3 Africa 1962 1598.
## 4 Africa 1967 2050.
## 5 Africa 1972 2340.
## 6 Africa 1977 2586.
## 7 Africa 1982 2482.
## 8 Africa 1987 2283.
## 9 Africa 1992 2282.
## 10 Africa 1997 2379.
## # ℹ 50 more rows
# Country with the highest GDP per capita in each survey year, shown with its
# life expectancy. The result was previously stored under the name
# `highest_lifeExp` although it selects on GDP; the name now matches the
# selection criterion.
# Comparing with `==` is exact here because max() returns one of the column's
# own values; tied countries, if any, are all kept.
highest_gdpPercap <- data %>%
  group_by(year) %>%
  filter(gdpPercap == max(gdpPercap)) %>%
  select(year, country, continent, gdpPercap, lifeExp) %>%
  arrange(year)
highest_gdpPercap
## # A tibble: 12 × 5
## # Groups: year [12]
## year country continent gdpPercap lifeExp
## <int> <fct> <fct> <dbl> <dbl>
## 1 1952 Kuwait Asia 108382. 55.6
## 2 1957 Kuwait Asia 113523. 58.0
## 3 1962 Kuwait Asia 95458. 60.5
## 4 1967 Kuwait Asia 80895. 64.6
## 5 1972 Kuwait Asia 109348. 67.7
## 6 1977 Kuwait Asia 59265. 69.3
## 7 1982 Saudi Arabia Asia 33693. 63.0
## 8 1987 Norway Europe 31541. 75.9
## 9 1992 Kuwait Asia 34933. 75.2
## 10 1997 Norway Europe 41283. 78.3
## 11 2002 Norway Europe 44684. 79.0
## 12 2007 Norway Europe 49357. 80.2
# Country with the highest life expectancy in each survey year, shown with its
# GDP per capita. The result was previously stored under the name
# `highest_gdpPercap` although it selects on life expectancy.
highest_lifeExp <- data %>%
  group_by(year) %>%
  filter(lifeExp == max(lifeExp)) %>%
  select(year, country, continent, gdpPercap, lifeExp) %>%
  arrange(year)
highest_lifeExp
## # A tibble: 12 × 5
## # Groups: year [12]
## year country continent gdpPercap lifeExp
## <int> <fct> <fct> <dbl> <dbl>
## 1 1952 Norway Europe 10095. 72.7
## 2 1957 Iceland Europe 9244. 73.5
## 3 1962 Iceland Europe 10350. 73.7
## 4 1967 Sweden Europe 15258. 74.2
## 5 1972 Sweden Europe 17832. 74.7
## 6 1977 Iceland Europe 19655. 76.1
## 7 1982 Japan Asia 19384. 77.1
## 8 1987 Japan Asia 22376. 78.7
## 9 1992 Japan Asia 26825. 79.4
## 10 1997 Japan Asia 28817. 80.7
## 11 2002 Japan Asia 28605. 82
## 12 2007 Japan Asia 31656. 82.6
# Pearson (linear) correlation between life expectancy and GDP per capita,
# computed over all rows of the table.
correlation <- with(data, cor(lifeExp, gdpPercap, method = "pearson"))
cat(
  "Pearson correlation coefficient between life expectancy and GDP per Capita is",
  correlation
)
## Pearson correlation coefficient between life expectancy and GDP per Capita is 0.5837062
# One-way ANOVA: does mean GDP per capita differ between continents?
anova_gdpPercap <- aov(gdpPercap ~ continent, data = data)
summary_result <- summary(anova_gdpPercap)

# The summary is a list whose first element is the ANOVA table; the first
# entry of its "Pr(>F)" column belongs to the `continent` term.
anova_table <- summary_result[[1]]
p_value <- anova_table[["Pr(>F)"]][1]
cat("The p-value for the ANOVA test is", p_value, "\n")
## The p-value for the ANOVA test is 1.172126e-94
# Report the ANOVA outcome at the 5% significance level.
cat(
  if (p_value < 0.05) {
    "The differences in GDP per capita across continents are statistically significant."
  } else {
    "The differences in GDP per capita across continents are not statistically significant."
  }
)
## The differences in GDP per capita across continents are statistically significant.
# Two-sample t-test of GDP per capita, Asia versus Europe. Each sample pools
# every row (all countries, all survey years) belonging to the continent.
# The vector names are kept because t.test() echoes them in its "data:" line.
asia_gdp <- data$gdpPercap[data$continent == "Asia"]
europe_gdp <- data$gdpPercap[data$continent == "Europe"]
t_test_result <- t.test(asia_gdp, europe_gdp)
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: asia_gdp and europe_gdp
## t = -7.6278, df = 693.01, p-value = 7.904e-14
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8257.753 -4876.897
## sample estimates:
## mean of x mean of y
## 7902.15 14469.48
# Interpretation: extract the p-value from the test object.
# This reuses the name `p_value`, replacing the value stored by the ANOVA step.
p_value <- t_test_result[["p.value"]]
cat(
  "The p-value for the t-test comparing GDP per capita between Asia and Europe is",
  p_value,
  "\n"
)
## The p-value for the t-test comparing GDP per capita between Asia and Europe is 7.903913e-14
# Report the t-test outcome at the 5% significance level.
cat(
  if (p_value < 0.05) {
    "The difference in GDP per capita between Asia and Europe is statistically significant."
  } else {
    "The difference in GDP per capita between Asia and Europe is not statistically significant."
  }
)
## The difference in GDP per capita between Asia and Europe is statistically significant.
# Trend of the continent-level mean life expectancy: the summary stat averages
# all countries of a continent at each survey year, giving one line per
# continent.
ggplot(data = data) +
  geom_line(
    mapping = aes(x = year, y = lifeExp, colour = continent),
    stat = "summary",
    fun = mean
  ) +
  labs(
    x = "Year",
    y = "Life Expectancy",
    title = "Life Expectancy Over Time by Continent"
  )

# Same construction for the continent-level mean GDP per capita.
ggplot(data = data) +
  geom_line(
    mapping = aes(x = year, y = gdpPercap, colour = continent),
    stat = "summary",
    fun = mean
  ) +
  labs(
    x = "Year",
    y = "GDP per Capita",
    title = "GDP per Capita Over Time"
  )
# Pairwise correlations between the three numeric measures.
numeric_data <- select(data, lifeExp, gdpPercap, pop)
correlation_matrix <- cor(numeric_data)
correlation_matrix
## lifeExp gdpPercap pop
## lifeExp 1.00000000 0.58370622 0.06495537
## gdpPercap 0.58370622 1.00000000 -0.02559958
## pop 0.06495537 -0.02559958 1.00000000
# Draw the correlation matrix as a heat map, printing each coefficient
# inside its cell.
ggcorrplot(corr = correlation_matrix, lab = TRUE)
# Distribution of GDP per capita within each continent (all years pooled).
ggplot(data = data) +
  geom_boxplot(mapping = aes(x = continent, y = gdpPercap, fill = continent)) +
  labs(
    x = "Continent",
    y = "GDP per Capita",
    title = "GDP per Capita by Continent"
  )

# Distribution of life expectancy within each continent (all years pooled).
ggplot(data = data) +
  geom_boxplot(mapping = aes(x = continent, y = lifeExp, fill = continent)) +
  labs(
    x = "Continent",
    y = "Life Expectancy",
    title = "Life Expectancy by Continent"
  )

# Life expectancy against GDP per capita, one point per row of the table,
# coloured by continent.
ggplot(data = data) +
  geom_point(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  labs(
    x = "GDP per Capita",
    y = "Life Expectancy",
    title = "Life Expectancy vs GDP per Capita"
  )
# Quartiles and interquartile range of GDP per capita and life expectancy,
# computed over the whole table (not per continent or per year).
gdp_iqr <- IQR(data$gdpPercap)
gdp_q1 <- quantile(data$gdpPercap, 0.25)
gdp_q3 <- quantile(data$gdpPercap, 0.75)
lifeExp_iqr <- IQR(data$lifeExp)
lifeExp_q1 <- quantile(data$lifeExp, 0.25)
lifeExp_q3 <- quantile(data$lifeExp, 0.75)

# Filter outliers with the 1.5 * IQR rule: a row is kept only when BOTH
# variables lie within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
# The bounds are inclusive: a value sitting exactly on a fence is not beyond
# it, so it is not an outlier (the same convention boxplot whiskers use).
# Because the fences are global, a per-continent boxplot of the filtered data
# can still flag points relative to that continent's own quartiles.
filtered_data <- data %>%
  filter(
    gdpPercap >= (gdp_q1 - 1.5 * gdp_iqr),
    gdpPercap <= (gdp_q3 + 1.5 * gdp_iqr),
    lifeExp >= (lifeExp_q1 - 1.5 * lifeExp_iqr),
    lifeExp <= (lifeExp_q3 + 1.5 * lifeExp_iqr)
  )
# The same three views as for the full data, redrawn on the rows that
# survived the IQR filter.

# GDP per capita by continent.
ggplot(data = filtered_data) +
  geom_boxplot(mapping = aes(x = continent, y = gdpPercap, fill = continent)) +
  labs(
    x = "Continent",
    y = "GDP per Capita",
    title = "GDP per Capita by Continent (Without Outliers)"
  )

# Life expectancy by continent.
ggplot(data = filtered_data) +
  geom_boxplot(mapping = aes(x = continent, y = lifeExp, fill = continent)) +
  labs(
    x = "Continent",
    y = "Life Expectancy",
    title = "Life Expectancy by Continent (Without Outliers)"
  )

# Life expectancy against GDP per capita, coloured by continent.
ggplot(data = filtered_data) +
  geom_point(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  labs(
    x = "GDP per Capita",
    y = "Life Expectancy",
    title = "Life Expectancy vs GDP per Capita (Without Outliers)"
  )