# Session setup ----
# The script deliberately does NOT call rm(list = ls()): wiping the global
# environment does not produce a clean session (attached packages and options
# survive) and silently destroys the caller's objects when this file is
# sourced. Restart R to get a clean run.

# Use a fixed CRAN mirror so that installation works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Install only the packages that are not available yet. Unconditional
# install.packages() calls re-download (and, for source packages, re-compile)
# everything on every run of the script.
# NOTE(review): the library() calls further down also attach labelled, naniar,
# Hmisc, lfe and childhoodmortality, which are not installed here - confirm
# they are available on the target machine.
required_packages <- c(
  "data.table", "survival", "dplyr", "survey",
  "tidyverse", "readstata13", "fastDummies"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# laden der libraries
library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(labelled)
library(naniar)
library(naniar)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(haven)
library(ggplot2)
library(lfe)
## Loading required package: Matrix
library(fastDummies)
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(labelled)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(childhoodmortality)
library(survival)
library(survey)
## Loading required package: grid
## 
## Attaching package: 'survey'
## The following object is masked from 'package:Hmisc':
## 
##     deff
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ tidyr::expand()       masks Matrix::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ tidyr::pack()         masks Matrix::pack()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ Hmisc::src()          masks dplyr::src()
## ✖ Hmisc::summarize()    masks dplyr::summarize()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ tidyr::unpack()       masks Matrix::unpack()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(ggplot2)
# Data preparation ----
# Location of the SPSS (.SAV) input file. It can be overridden with the
# environment variable DHS_BR_FILE; when that is unset, the original
# hard-coded path is used, so existing runs behave as before.
file_path <- Sys.getenv(
  "DHS_BR_FILE",
  unset = "/Users/nilsborgmannprivate/Downloads/KEBR8BSV (1)/KEBR8BFL.SAV"
)
if (!file.exists(file_path)) {
  stop("Input file not found: ", file_path, call. = FALSE)
}
data <- read_sav(file_path)

# Keep only the variables used in the analysis.
filtered_data <- data %>%
  select(CASEID, B3, B7, V008, V106, V107, V024)

# Drop the rows in which V107 is missing.
# NOTE(review): the V106 distribution recorded later in this script contains
# only the levels 1-3 although a "No education" (0) label exists, so this
# filter appears to remove every row with V106 == 0 - confirm this is intended.
filtered_data_no_na <- filtered_data %>%
  filter(!is.na(V107))

# Number of rows left after filtering.
num_individuals_after_filter <- nrow(filtered_data_no_na)
print(paste("Anzahl der Individuen nach dem Filtern:", num_individuals_after_filter))
## [1] "Anzahl der Individuen nach dem Filtern: 59508"
# Overview of the structure of the filtered data.
str(filtered_data_no_na)
## tibble [59,508 × 7] (S3: tbl_df/tbl/data.frame)
##  $ CASEID: chr [1:59508] "       1   7  2" "       1  10  1" "       1  13  2" "       1  13  2" ...
##   ..- attr(*, "label")= chr "Case Identification"
##   ..- attr(*, "format.spss")= chr "A15"
##   ..- attr(*, "display_width")= int 17
##  $ B3    : num [1:59508] 1272 1330 1432 1402 1319 ...
##   ..- attr(*, "label")= chr "Date of birth (CMC)"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 6
##  $ B7    : num [1:59508] NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Age at death (months, imputed)"
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 5
##  $ V008  : num [1:59508] 1468 1468 1468 1468 1468 ...
##   ..- attr(*, "label")= chr "Date of interview (CMC)"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 6
##  $ V106  : dbl+lbl [1:59508] 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 2...
##    ..@ label        : chr "Highest educational level"
##    ..@ format.spss  : chr "F1.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num [1:4] 0 1 2 3
##    .. ..- attr(*, "names")= chr [1:4] "No education" "Primary" "Secondary" "Higher"
##  $ V107  : dbl+lbl [1:59508] 4, 8, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 3, 3, 3, 8, 8, 4...
##    ..@ label        : chr "Highest year of education"
##    ..@ format.spss  : chr "F2.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num 0
##    .. ..- attr(*, "names")= chr "No years completed at level V106"
##  $ V024  : dbl+lbl [1:59508] 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
##    ..@ label        : chr "Region"
##    ..@ format.spss  : chr "F2.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num [1:47] 1 2 3 4 5 6 7 8 9 10 ...
##    .. ..- attr(*, "names")= chr [1:47] "Mombasa" "Kwale" "Kilifi" "Tana River" ...
# Number of missing values in every column.
missing_values <- vapply(
  filtered_data_no_na,
  function(column) sum(is.na(column)),
  numeric(1)
)
print(missing_values)
## CASEID     B3     B7   V008   V106   V107   V024 
##      0      0  56165      0      0      0      0
# Distribution of the sample across the counties (regions)

# Number of rows in the filtered data set.
num_individuals <- dim(filtered_data_no_na)[1]
print(num_individuals)
## [1] 59508
# Check the column names.
names(filtered_data_no_na)
## [1] "CASEID" "B3"     "B7"     "V008"   "V106"   "V107"   "V024"
# 'filtered_data_no_na' is the data frame left after dropping rows with NA in V107.
# Tabulate how many rows fall into each county (V024).
county_distribution <- dplyr::count(filtered_data_no_na, V024, name = "Anzahl")

# Show the distribution.
print(county_distribution)
## # A tibble: 47 × 2
##    V024              Anzahl
##    <dbl+lbl>          <int>
##  1  1 [Mombasa]        1171
##  2  2 [Kwale]          1131
##  3  3 [Kilifi]         1201
##  4  4 [Tana River]      818
##  5  5 [Lamu]           1253
##  6  6 [Taita Taveta]   1040
##  7  7 [Garissa]         217
##  8  8 [Wajir]           228
##  9  9 [Mandera]         182
## 10 10 [Marsabit]        414
## # ℹ 37 more rows
# Bar chart of the number of rows per county.
# haven::as_factor() converts the labelled county codes into a factor of
# county names. Base as.factor() keeps only the numeric codes, so the rotated
# axis text would read "1", "2", ... instead of "Mombasa", "Kwale", ...
ggplot(county_distribution, aes(x = haven::as_factor(V024), y = Anzahl)) +
  geom_col(fill = "steelblue") +
  labs(title = "Verteilung der Individuen pro County",
       x = "County",
       y = "Anzahl der Individuen") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Sanity check: the per-county counts must add up to the size of the
# filtered sample.
total_count_from_distribution <- sum(county_distribution[["Anzahl"]])

# Pick the message first, then print it once.
print(
  if (total_count_from_distribution != num_individuals_after_filter) {
    "Es gibt eine Diskrepanz zwischen der Summe der Individuen pro County und der Anzahl der Individuen in der gefilterten Stichprobe."
  } else {
    "Die Summe der Individuen pro County entspricht der Anzahl der Individuen in der gefilterten Stichprobe."
  }
)
## [1] "Die Summe der Individuen pro County entspricht der Anzahl der Individuen in der gefilterten Stichprobe."
# Report both numbers that were compared.
print(paste("Summe der Individuen pro County:", total_count_from_distribution))
## [1] "Summe der Individuen pro County: 59508"
print(paste("Anzahl der Individuen in der gefilterten Stichprobe:", num_individuals_after_filter))
## [1] "Anzahl der Individuen in der gefilterten Stichprobe: 59508"
# Descriptive statistics (mean, median, standard deviation, range) of V107.
# NOTE(review): the value label of V107 reads "No years completed at level
# V106", so V107 is presumably counted within the level given by V106 rather
# than as total years of schooling - confirm against the codebook.
descriptive_stats <- dplyr::summarise(
  filtered_data_no_na,
  Mean = mean(V107, na.rm = TRUE),
  Median = median(V107, na.rm = TRUE),
  SD = sd(V107, na.rm = TRUE),
  Min = min(V107, na.rm = TRUE),
  Max = max(V107, na.rm = TRUE)
)
print(descriptive_stats)
## # A tibble: 1 × 5
##    Mean Median    SD Min                                  Max      
##   <dbl>  <dbl> <dbl> <dbl+lbl>                            <dbl+lbl>
## 1  5.24      5  2.38 0 [No years completed at level V106] 12
# Frequency table of V107: absolute counts plus the share of each value
# in percent of all rows.
education_years_distribution <- filtered_data_no_na %>%
  dplyr::count(V107, name = "Count") %>%
  dplyr::mutate(Percentage = Count / sum(Count) * 100)

# Show the distribution.
print(education_years_distribution)
## # A tibble: 11 × 3
##    V107                                  Count Percentage
##    <dbl+lbl>                             <int>      <dbl>
##  1  0 [No years completed at level V106]   596    1.00   
##  2  1                                     2529    4.25   
##  3  2                                     6474   10.9    
##  4  3                                     6101   10.3    
##  5  4                                    12097   20.3    
##  6  5                                     2446    4.11   
##  7  6                                     4352    7.31   
##  8  7                                     7755   13.0    
##  9  8                                    17151   28.8    
## 10  9                                        4    0.00672
## 11 12                                        3    0.00504
# Rebuild the V107 frequency table (counts and percentages) so that this
# section does not depend on the table created earlier.
education_years_distribution <- filtered_data_no_na %>%
  dplyr::group_by(V107) %>%
  dplyr::tally(name = "Count") %>%
  dplyr::mutate(Percentage = Count / sum(Count) * 100)

# Bar chart of the V107 distribution, one coloured bar per value.
ggplot(
  education_years_distribution,
  aes(x = as.factor(V107), y = Count, fill = as.factor(V107))
) +
  geom_col() +
  labs(
    title = "Verteilung der Bildungsjahre",
    x = "Anzahl der Bildungsjahre",
    y = "Anzahl der Individuen"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mean of V107 per county, with the county column renamed to 'County'.
# Detach plyr if it is attached - presumably so that the unqualified
# summarise()/rename() calls below resolve to dplyr.
if (is.element("package:plyr", search())) {
  detach("package:plyr", unload = TRUE)
}

# Rename first, then aggregate per county.
county_schooling_distribution <- filtered_data_no_na %>%
  rename(County = V024) %>%
  group_by(County) %>%
  summarise(Avg_Years_of_Schooling = mean(V107, na.rm = TRUE))


# Show the per-county averages.
print(county_schooling_distribution)
## # A tibble: 47 × 2
##    County            Avg_Years_of_Schooling
##    <dbl+lbl>                          <dbl>
##  1  1 [Mombasa]                        5.14
##  2  2 [Kwale]                          5.63
##  3  3 [Kilifi]                         5.07
##  4  4 [Tana River]                     5.47
##  5  5 [Lamu]                           5.19
##  6  6 [Taita Taveta]                   5.25
##  7  7 [Garissa]                        4.68
##  8  8 [Wajir]                          5.58
##  9  9 [Mandera]                        5.32
## 10 10 [Marsabit]                       5.76
## # ℹ 37 more rows
# Bar chart of the mean of V107 per county.
# haven::as_factor() is used instead of base as.factor() so that the axis
# shows the county names stored in the value labels rather than the numeric
# county codes.
ggplot(county_schooling_distribution,
       aes(x = haven::as_factor(County), y = Avg_Years_of_Schooling)) +
  geom_col(fill = "coral") +
  labs(title = "Durchschnittsjahre der Schulbildung pro County",
       x = "County",
       y = "Durchschnittsjahre der Schulbildung") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Binary indicator: 1 if a death before the fifth birthday (B7 < 60 months)
# is recorded, 0 otherwise.
# B7 (age at death in months) is NA in most rows. A bare `B7 < 60` is NA for
# those rows, which would leave the indicator undefined instead of 0, so the
# NA case is handled explicitly.
# NOTE(review): rows for children born less than 60 months before the
# interview (V008 - B3 < 60) have not been observed for the full five years -
# consider whether they should be excluded or treated as censored.
filtered_data_no_na <- filtered_data_no_na %>%
  mutate(death_before_5 = as.numeric(!is.na(B7) & B7 < 60))

# Per county: number of rows, number of deaths before age five, and their
# share of all rows in that county.
mortality_by_county <- filtered_data_no_na %>%
  group_by(V024) %>%
  summarise(Count = n(),
            Deaths = sum(death_before_5),
            Mortality_Rate = Deaths / Count) %>%
  rename(County = V024)

# Show the table.
print(mortality_by_county)
## # A tibble: 47 × 4
##    County            Count Deaths Mortality_Rate
##    <dbl+lbl>         <int>  <dbl>          <dbl>
##  1  1 [Mombasa]       1171     58         0.0495
##  2  2 [Kwale]         1131     26         0.0230
##  3  3 [Kilifi]        1201     45         0.0375
##  4  4 [Tana River]     818     40         0.0489
##  5  5 [Lamu]          1253     73         0.0583
##  6  6 [Taita Taveta]  1040     42         0.0404
##  7  7 [Garissa]        217      5         0.0230
##  8  8 [Wajir]          228     20         0.0877
##  9  9 [Mandera]        182      5         0.0275
## 10 10 [Marsabit]       414     13         0.0314
## # ℹ 37 more rows
# Bar chart of the share of deaths before age five per county.
# The chart is drawn twice with different fill colours, so the shared plot
# definition lives in one helper. haven::as_factor() puts the county names on
# the axis; base as.factor() would show only the numeric county codes.
plot_mortality_by_county <- function(fill_colour) {
  ggplot(mortality_by_county,
         aes(x = haven::as_factor(County), y = Mortality_Rate)) +
    geom_col(fill = fill_colour) +
    labs(title = "Kindersterblichkeitsrate pro County",
         x = "County",
         y = "Kindersterblichkeitsrate") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# First rendering.
plot_mortality_by_county("skyblue")

# Second rendering of the same chart in a different colour.
plot_mortality_by_county("salmon")

# Preparation for the linear regression model ----

# Start again from the raw data: keep only the variables used in the analysis.
filtered_data <- data %>%
  select(CASEID, B3, B7, V008, V106, V107, V024)

# Drop the rows in which V107 is missing.
filtered_data_no_na <- filtered_data %>%
  filter(!is.na(V107))

# Binary indicator: 1 if a death before the fifth birthday (B7 < 60 months)
# is recorded, 0 otherwise.
# B7 is NA in most rows. With ifelse(B7 < 60, 1, 0) the indicator was NA for
# all of them, and the models below then dropped those rows (the recorded
# summaries report "56165 observations deleted due to missingness"), i.e. they
# were fitted only on rows with a recorded death. Coding NA as 0 keeps every
# row in the estimation sample.
# NOTE: the console output recorded further down in this file (the str()
# listing and both felm summaries) predates this fix and must be regenerated.
# NOTE(review): rows for children born less than 60 months before the
# interview (V008 - B3 < 60) have not been observed for the full five years -
# consider whether they should be excluded or treated as censored.
filtered_data_no_na <- filtered_data_no_na %>%
  mutate(death_before_5 = as.numeric(!is.na(B7) & B7 < 60))

# Number of rows after creating the indicator (unchanged by the mutate).
num_individuals_with_death_variable <- nrow(filtered_data_no_na)
print(paste("Anzahl der Individuen nach Erstellung der binären Variable für Kindersterblichkeit:", num_individuals_with_death_variable))
## [1] "Anzahl der Individuen nach Erstellung der binären Variable für Kindersterblichkeit: 59508"
# Turn the county variable V024 into a factor so that it can serve as the
# grouping (fixed-effect) variable in the models below.
filtered_data_no_na$V024 <- as.factor(filtered_data_no_na$V024)

# Inspect the resulting structure.
str(filtered_data_no_na)
## tibble [59,508 × 8] (S3: tbl_df/tbl/data.frame)
##  $ CASEID        : chr [1:59508] "       1   7  2" "       1  10  1" "       1  13  2" "       1  13  2" ...
##   ..- attr(*, "label")= chr "Case Identification"
##   ..- attr(*, "format.spss")= chr "A15"
##   ..- attr(*, "display_width")= int 17
##  $ B3            : num [1:59508] 1272 1330 1432 1402 1319 ...
##   ..- attr(*, "label")= chr "Date of birth (CMC)"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 6
##  $ B7            : num [1:59508] NA NA NA NA NA NA NA NA NA NA ...
##   ..- attr(*, "label")= chr "Age at death (months, imputed)"
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "display_width")= int 5
##  $ V008          : num [1:59508] 1468 1468 1468 1468 1468 ...
##   ..- attr(*, "label")= chr "Date of interview (CMC)"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "display_width")= int 6
##  $ V106          : dbl+lbl [1:59508] 2, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 3, 3, 3, 1, 1, 2...
##    ..@ label        : chr "Highest educational level"
##    ..@ format.spss  : chr "F1.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num [1:4] 0 1 2 3
##    .. ..- attr(*, "names")= chr [1:4] "No education" "Primary" "Secondary" "Higher"
##  $ V107          : dbl+lbl [1:59508] 4, 8, 4, 4, 4, 4, 4, 8, 8, 4, 4, 4, 3, 3, 3, 8, 8, 4...
##    ..@ label        : chr "Highest year of education"
##    ..@ format.spss  : chr "F2.0"
##    ..@ display_width: int 6
##    ..@ labels       : Named num 0
##    .. ..- attr(*, "names")= chr "No years completed at level V106"
##  $ V024          : Factor w/ 47 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ death_before_5: num [1:59508] NA NA NA NA NA NA NA NA NA NA ...
# Model specification: death_before_5 regressed on V107, with V024 (county)
# in the fixed-effects part of the felm formula.
formula <- death_before_5 ~ V107 | V024

# Fit the fixed-effects model and show the summary.
model_fe <- felm(formula, data = filtered_data_no_na)
summary(model_fe)
## 
## Call:
##    felm(formula = formula, data = filtered_data_no_na) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96279  0.06546  0.10406  0.13658  0.23111 
## 
## Coefficients:
##       Estimate Std. Error t value Pr(>|t|)
## V107 -0.001594   0.002407  -0.662    0.508
## 
## Residual standard error: 0.3157 on 3295 degrees of freedom
##   (56165 observations deleted due to missingness)
## Multiple R-squared(full model): 0.02037   Adjusted R-squared: 0.006392 
## Multiple R-squared(proj model): 0.000133   Adjusted R-squared: -0.01413 
## F-statistic(full model):1.457 on 47 and 3295 DF, p-value: 0.023 
## F-statistic(proj model): 0.4384 on 1 and 3295 DF, p-value: 0.5079
# The same steps for the variable V106 (highest educational level)

# Frequency table of V106: absolute counts plus the share of each level
# in percent of all rows.
education_level_distribution <- filtered_data_no_na %>%
  dplyr::count(V106, name = "Count") %>%
  dplyr::mutate(Percentage = Count / sum(Count) * 100)

# Show the distribution.
print(education_level_distribution)
## # A tibble: 3 × 3
##   V106          Count Percentage
##   <dbl+lbl>     <int>      <dbl>
## 1 1 [Primary]   36001       60.5
## 2 2 [Secondary] 16462       27.7
## 3 3 [Higher]     7045       11.8
# Bar chart of the distribution of educational levels.
# haven::as_factor() shows the level names from the value labels ("Primary",
# "Secondary", "Higher"); base as.factor() would show only the codes 1, 2, 3.
# The legend title is set explicitly so it does not display the raw
# expression used for the fill aesthetic.
ggplot(education_level_distribution,
       aes(x = haven::as_factor(V106), y = Count,
           fill = haven::as_factor(V106))) +
  geom_col() +
  labs(title = "Verteilung der Bildungsniveaus",
       x = "Bildungsniveau",
       y = "Anzahl der Individuen",
       fill = "Bildungsniveau") +
  theme_minimal()

# Rebuild the V106 frequency table (counts and percentages).
education_level_distribution <- filtered_data_no_na %>%
  dplyr::group_by(V106) %>%
  dplyr::tally(name = "Count") %>%
  dplyr::mutate(Percentage = Count / sum(Count) * 100)

# Show the distribution.
print(education_level_distribution)
## # A tibble: 3 × 3
##   V106          Count Percentage
##   <dbl+lbl>     <int>      <dbl>
## 1 1 [Primary]   36001       60.5
## 2 2 [Secondary] 16462       27.7
## 3 3 [Higher]     7045       11.8
# Check the column names.
names(filtered_data_no_na)
## [1] "CASEID"         "B3"             "B7"             "V008"          
## [5] "V106"           "V107"           "V024"           "death_before_5"
# Mean of the educational level code (V106) per county in the sample.
# NOTE(review): V106 is a coded level (1 = Primary, 2 = Secondary,
# 3 = Higher), so this mean treats the codes as equally spaced numbers.
county_education_distribution <- filtered_data_no_na %>%
  rename(County = V024) %>%
  group_by(County) %>%
  summarise(Avg_Education_Level = mean(V106, na.rm = TRUE))

print(county_education_distribution)
## # A tibble: 47 × 2
##    County Avg_Education_Level
##    <fct>                <dbl>
##  1 1                     1.61
##  2 2                     1.28
##  3 3                     1.27
##  4 4                     1.23
##  5 5                     1.26
##  6 6                     1.50
##  7 7                     1.52
##  8 8                     1.37
##  9 9                     1.31
## 10 10                    1.31
## # ℹ 37 more rows
# Bar chart of the mean educational level code per county.
# County is already a factor at this point, so it is mapped to x directly.
ggplot(
  county_education_distribution,
  aes(x = County, y = Avg_Education_Level)
) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Durchschnittliches Bildungsniveau pro County",
    x = "County",
    y = "Durchschnittliches Bildungsniveau"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Model specification: death_before_5 regressed on V106, with V024 (county)
# in the fixed-effects part of the felm formula.
# NOTE(review): V106 enters as a single numeric regressor, i.e. the level
# codes are treated as equally spaced - confirm this is intended.
formula_education <- death_before_5 ~ V106 | V024

# Fit the fixed-effects model and show the summary.
model_fe_education <- felm(formula_education, data = filtered_data_no_na)
summary(model_fe_education)
## 
## Call:
##    felm(formula = formula_education, data = filtered_data_no_na) 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00669  0.06449  0.09893  0.13585  0.24110 
## 
## Coefficients:
##      Estimate Std. Error t value Pr(>|t|)    
## V106 0.038017   0.009415   4.038 5.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.315 on 3295 degrees of freedom
##   (56165 observations deleted due to missingness)
## Multiple R-squared(full model): 0.02506   Adjusted R-squared: 0.01115 
## Multiple R-squared(proj model): 0.004924   Adjusted R-squared: -0.00927 
## F-statistic(full model):1.802 on 47 and 3295 DF, p-value: 0.0007003 
## F-statistic(proj model):  16.3 on 1 and 3295 DF, p-value: 5.519e-05