Learning about EDA

R
EDA
Analysis
Author

Nina Little

Published

December 6, 2023

Introduction to Exploratory Data Analysis (EDA)

We will walk through how to perform an EDA on a small subset of data.

Load the appropriate libraries

# Install and load necessary packages if not already installed
# install.packages("tidyverse")

library(tidyverse)
Warning: package 'dplyr' was built under R version 4.3.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

Load the dataset

# Read the passenger CSV (path is relative to the working directory)
PassData <- read.csv(file = "Titanic Passengers.csv")

Basic Exploration of the data

The code below helps to understand the data

# Preview the first six rows (six is head()'s default, spelled out here)
head(PassData, n = 6L)
                                             Name Survived Passenger.Class
1                   Allen, Miss. Elisabeth Walton        1               1
2                  Allison, Master. Hudson Trevor        1               1
3                    Allison, Miss. Helen Loraine        0               1
4            Allison, Mr. Hudson Joshua Creighton        0               1
5 Allison, Mrs. Hudson J C (Bessie Waldo Daniels)        0               1
6                             Anderson, Mr. Harry        1               1
     Sex     Age SiblingsandSpouses ParentsandChildren     Fare Port
1 female 29.0000                  0                  0 211.3375    S
2   male  0.9167                  1                  2 151.5500    S
3 female  2.0000                  1                  2 151.5500    S
4   male 30.0000                  1                  2 151.5500    S
5 female 25.0000                  1                  2 151.5500    S
6   male 48.0000                  0                  0  26.5500    S
               Home...Destination Validation
1                    St Louis, MO          1
2 Montreal, PQ / Chesterville, ON          1
3 Montreal, PQ / Chesterville, ON          0
4 Montreal, PQ / Chesterville, ON          0
5 Montreal, PQ / Chesterville, ON          0
6                    New York, NY          0
# Per-column summary: quartiles/mean for numeric columns, length/class for text
summary(object = PassData)
     Name              Survived     Passenger.Class     Sex           
 Length:1309        Min.   :0.000   Min.   :1.000   Length:1309       
 Class :character   1st Qu.:0.000   1st Qu.:2.000   Class :character  
 Mode  :character   Median :0.000   Median :3.000   Mode  :character  
                    Mean   :0.382   Mean   :2.295                     
                    3rd Qu.:1.000   3rd Qu.:3.000                     
                    Max.   :1.000   Max.   :3.000                     
                                                                      
      Age          SiblingsandSpouses ParentsandChildren      Fare        
 Min.   : 0.1667   Min.   :0.0000     Min.   :0.000      Min.   :  0.000  
 1st Qu.:21.0000   1st Qu.:0.0000     1st Qu.:0.000      1st Qu.:  7.896  
 Median :28.0000   Median :0.0000     Median :0.000      Median : 14.454  
 Mean   :29.8811   Mean   :0.4989     Mean   :0.385      Mean   : 33.295  
 3rd Qu.:39.0000   3rd Qu.:1.0000     3rd Qu.:0.000      3rd Qu.: 31.275  
 Max.   :80.0000   Max.   :8.0000     Max.   :9.000      Max.   :512.329  
 NA's   :263                                             NA's   :1        
     Port           Home...Destination   Validation    
 Length:1309        Length:1309        Min.   :0.0000  
 Class :character   Class :character   1st Qu.:0.0000  
 Mode  :character   Mode  :character   Median :0.0000  
                                       Mean   :0.3048  
                                       3rd Qu.:1.0000  
                                       Max.   :1.0000  
                                                       
# Compact structure listing: column names, types, and the first few values
str(object = PassData)
'data.frame':   1309 obs. of  11 variables:
 $ Name              : chr  "Allen, Miss. Elisabeth Walton" "Allison, Master. Hudson Trevor" "Allison, Miss. Helen Loraine" "Allison, Mr. Hudson Joshua Creighton" ...
 $ Survived          : int  1 1 0 0 0 1 1 0 1 0 ...
 $ Passenger.Class   : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Sex               : chr  "female" "male" "female" "male" ...
 $ Age               : num  29 0.917 2 30 25 ...
 $ SiblingsandSpouses: int  0 1 1 1 1 0 1 0 2 0 ...
 $ ParentsandChildren: int  0 2 2 2 2 0 0 0 0 0 ...
 $ Fare              : num  211 152 152 152 152 ...
 $ Port              : chr  "S" "S" "S" "S" ...
 $ Home...Destination: chr  "St Louis, MO" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" "Montreal, PQ / Chesterville, ON" ...
 $ Validation        : int  1 1 0 0 0 0 1 1 1 0 ...
# Number of rows and columns in the data frame
dim(PassData)
[1] 1309   11

Evaluate the data for Data Cleaning

Addressing missing values, removing duplicates and displaying the column names

# Count NA cells across the whole data frame (all columns combined)
na_cells <- is.na(PassData)
sum(na_cells)
[1] 264
# Remove exact duplicate rows. The result is stored back in PassData so that
# every later step works on the de-duplicated data (previously it was
# assigned to an unrelated, unused variable and had no effect).
PassData <- distinct(PassData)

# Display column names
names(PassData)
 [1] "Name"               "Survived"           "Passenger.Class"   
 [4] "Sex"                "Age"                "SiblingsandSpouses"
 [7] "ParentsandChildren" "Fare"               "Port"              
[10] "Home...Destination" "Validation"        

Univariate Analysis

Display the data in a histogram

# Histogram of the Fare column; binwidth = 1 draws one bar per unit of Fare.
# Title and axis label name the variable that is actually plotted.
ggplot(PassData, aes(x = Fare)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Fare", x = "Fare")
Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

Display the data in a box plot

# Box plots of two numeric columns, one plot per call
# Fare
boxplot(PassData[["Fare"]])

# Age
boxplot(PassData[["Age"]])

Statistics

Review the statistics of the data set

# Summary statistics for every column of the data frame

summary(object = PassData)
     Name              Survived     Passenger.Class     Sex           
 Length:1309        Min.   :0.000   Min.   :1.000   Length:1309       
 Class :character   1st Qu.:0.000   1st Qu.:2.000   Class :character  
 Mode  :character   Median :0.000   Median :3.000   Mode  :character  
                    Mean   :0.382   Mean   :2.295                     
                    3rd Qu.:1.000   3rd Qu.:3.000                     
                    Max.   :1.000   Max.   :3.000                     
                                                                      
      Age          SiblingsandSpouses ParentsandChildren      Fare        
 Min.   : 0.1667   Min.   :0.0000     Min.   :0.000      Min.   :  0.000  
 1st Qu.:21.0000   1st Qu.:0.0000     1st Qu.:0.000      1st Qu.:  7.896  
 Median :28.0000   Median :0.0000     Median :0.000      Median : 14.454  
 Mean   :29.8811   Mean   :0.4989     Mean   :0.385      Mean   : 33.295  
 3rd Qu.:39.0000   3rd Qu.:1.0000     3rd Qu.:0.000      3rd Qu.: 31.275  
 Max.   :80.0000   Max.   :8.0000     Max.   :9.000      Max.   :512.329  
 NA's   :263                                             NA's   :1        
     Port           Home...Destination   Validation    
 Length:1309        Length:1309        Min.   :0.0000  
 Class :character   Class :character   1st Qu.:0.0000  
 Mode  :character   Mode  :character   Median :0.0000  
                                       Mean   :0.3048  
                                       3rd Qu.:1.0000  
                                       Max.   :1.0000  
                                                       

Correlation Matrix

# Correlation matrix of Fare and Age.
# The summary output reports 263 NAs in Age and 1 NA in Fare; with cor()'s
# default NA handling those missing values make the off-diagonal entries NA.
# NOTE(review): pass use = "complete.obs" (or "pairwise.complete.obs") to get
# an actual coefficient, then re-render so the printed output matches.
cor(PassData[, c("Fare", "Age")])
     Fare Age
Fare    1  NA
Age    NA   1

Heatmaps

# Tile plot with ggplot2: Age on x, SiblingsandSpouses on y,
# tile fill mapped to Survived
ggplot(
  data = PassData,
  mapping = aes(x = Age, y = SiblingsandSpouses, fill = Survived)
) +
  geom_tile()
Warning: Removed 263 rows containing missing values (`geom_tile()`).

These steps are essential to apply before developing any models.