Data analysis.Rmd

---
title: "test report"
output: html_document
date: "2023-05-05"
---

```{r setup, echo = FALSE, warning = FALSE, message = FALSE}
library(knitr)
library(scales)
library(plyr)
library(tidyverse)
library(readxl)
library(openxlsx)
library(ggpubr)
```

```{r read in council data, echo = FALSE}
# Directory holding the formatted data workbooks
# (Folder: Research - Formatted Data, has been synced to personal drive).
# Keep the trailing "/": read_files() pastes the file name straight onto it.
folder_path <- "C:/Users/connachan.cara/IS/Research - Formatted Data/"

# Read the "Clean Data" sheet of one council workbook.
#
# Args:
#   directory_path: folder containing the workbook. It is pasted directly in
#     front of `file`, so it must end with a path separator.
#   file: workbook file name, e.g. one element of list.files(directory_path).
#
# Returns:
#   A tibble holding the rows of the workbook's "Clean Data" sheet.
read_files <- function(directory_path, file) {
  # Paste the folder path with the file name for the council in the loop
  xlsx_file <- paste0(directory_path, file)
  # Only one sheet is wanted, so call read_excel() directly rather than
  # mapping over a length-one vector of sheet names.
  read_excel(
    path = xlsx_file,
    sheet = "Clean Data",
    # Specify column types so each file reads in the same way: for some files
    # the date column is a date rather than text, so the first five columns
    # are forced to text and the sixth to numeric.
    col_types = c(rep("text", 5), "numeric")
  )
}

# List all the council workbooks in the folder. Restricting to Excel
# extensions keeps sub-folders and stray files away from read_excel().
council_data <- list.files(
  folder_path,
  pattern = "\\.xls[xm]?$",
  ignore.case = TRUE
) %>%
  # Skip the "~$..." owner/lock files Excel creates while a workbook is open
  .[!startsWith(., "~$")] %>%
  # Don't read in the SHS, SSCQ or gender pay gap data; these are comparator
  # datasets that are read separately
  setdiff(c("SHS 2021.xlsx", "SSCQ 2019.xlsx", "Gender Pay Gap.xlsx")) %>%
  # Run read_files() on each council file and stack the results in one
  # data frame
  map(~ read_files(folder_path, .x)) %>%
  bind_rows()
```

```{r read in SHS data, echo = FALSE}
# Scotland level population comparator
shs_data <- read_excel(
  # Build the path from folder_path so the data folder is set in one place
  paste0(folder_path, "SHS 2021.xlsx"),
  sheet = "Clean Data",
  col_types = c(rep("text", 5), "numeric")
) %>%
  # Add column with percentages within each characteristic
  group_by(Characteristic) %>%
  mutate(Percent = round((Value / sum(Value)) * 100, 1)) %>%
  # Drop the grouping so it does not carry over into later operations
  ungroup() %>%
  # Add column with dataset label
  mutate(Dataset = "Scotland - Adult Population (SHS)") %>%
  select(-Council, -`Period Covered`, -Date)
```

``` {r read in SSCQ data, echo = FALSE, warning = FALSE}
# Council level population comparator
sscq_data <- read_excel(
  # Build the path from folder_path so the data folder is set in one place
  paste0(folder_path, "SSCQ 2019.xlsx"),
  sheet = "Clean Data",
  # Seven columns: six read as text and the last one as numeric
  col_types = c(rep("text", 6), "numeric")
)

# Rename Edinburgh; presumably so it matches the name used in the council
# workbooks when the two datasets are merged later - TODO confirm
sscq_data$Council[sscq_data$Council == "Edinburgh, City of"] <- "City of Edinburgh"
```

```{r read in pay gap data, echo = FALSE}
# Gender pay gap workbook. No sheet is named, so read_excel() reads the first
# sheet. The path is built from folder_path so the data folder is set in one
# place.
pay_gap_data <- read_excel(paste0(folder_path, "Gender Pay Gap.xlsx"))
```

``` {r create council percentages, echo = FALSE}
# Drop the "Other" (prefer not to answer / not disclosed) measures first, so
# the percentages below are shares of the remaining responses only.
council_data_no_other <- filter(
  council_data,
  !(Measure %in% c(
    "Sex - Other (e.g. Prefer not to answer / not disclosed)",
    "Age - Other (e.g. Prefer not to answer / not disclosed)",
    "Disability - Other (e.g. Prefer not to answer / not disclosed)",
    "Ethnicity - Other (e.g. Prefer not to answer / not disclosed)",
    "Religion - Other (e.g. Prefer not to answer / not disclosed)",
    "Sexual Orientation - Other (e.g. prefer not to answer / not disclosed)",
    "Marital Status - Other (e.g. prefer not to answer / not disclosed)"
  ))
)

# Percentage of each measure within its council and characteristic
council_data_no_other <- council_data_no_other %>%
  group_by(Council, Characteristic) %>%
  mutate(Percent = round(Value / sum(Value, na.rm = TRUE) * 100, 1)) %>%
  ungroup()
```

```{r create scotland data with other, echo = FALSE}
# Scotland level share of "Other" (not disclosed) responses per characteristic.
# Percentages are calculated with the "Other" measures still included, and
# only then is the table cut down to the "Other" rows.
scotland_data_with_other <- council_data %>%
  # Add council values together for each measure
  group_by(Characteristic, Measure) %>%
  # .groups = "drop" returns ungrouped data and stops summarise() printing its
  # regrouping message into the report (this chunk does not set
  # message = FALSE)
  summarise(Value = sum(Value, na.rm = TRUE), .groups = "drop") %>%
  # Filter to exclude unique categories, these are categories used by one or
  # more councils therefore are not comparable at a Scotland level
  filter(!Characteristic %in% c(
    "Unique Age Categories",
    "Unique Sexual Orientation Categories",
    "SSCQ Ethnicity Categories"
  )) %>%
  # Add column with percentages within each characteristic
  group_by(Characteristic) %>%
  mutate(Percent = round((Value / sum(Value)) * 100, 1)) %>%
  ungroup() %>%
  # Add column with dataset label
  mutate(Dataset = "Scotland - Council Employees") %>%
  select(-Value) %>%
  # Keep only the "Other" rows for the table
  filter(Measure %in% c(
    "Sex - Other (e.g. Prefer not to answer / not disclosed)",
    "Age - Other (e.g. Prefer not to answer / not disclosed)",
    "Disability - Other (e.g. Prefer not to answer / not disclosed)",
    "Ethnicity - Other (e.g. Prefer not to answer / not disclosed)",
    "Religion - Other (e.g. Prefer not to answer / not disclosed)",
    "Sexual Orientation - Other (e.g. prefer not to answer / not disclosed)",
    "Marital Status - Other (e.g. prefer not to answer / not disclosed)"
  ))

kable(scotland_data_with_other)
  
```

``` {r aggregate Scotland data, echo = FALSE, warning = FALSE}
# Scotland level council employee percentages ("Other" responses excluded)
scotland_data <- council_data_no_other %>%
  # Add council values together for each measure
  group_by(Characteristic, Measure) %>%
  # .groups = "drop" returns ungrouped data and stops summarise() printing its
  # regrouping message into the report (this chunk sets warning = FALSE but
  # not message = FALSE)
  summarise(Value = sum(Value, na.rm = TRUE), .groups = "drop") %>%
  # Filter to exclude unique categories, these are categories used by one or
  # more councils therefore are not comparable at a Scotland level
  filter(!Characteristic %in% c(
    "Unique Age Categories",
    "Unique Sexual Orientation Categories",
    "SSCQ Ethnicity Categories"
  )) %>%
  # Add column with percentages within each characteristic
  group_by(Characteristic) %>%
  mutate(Percent = round((Value / sum(Value)) * 100, 1)) %>%
  # Return plain ungrouped data so it can be safely combined with other tables
  ungroup() %>%
  # Add column with dataset label
  mutate(Dataset = "Scotland - Council Employees") %>%
  select(-Value)
```

```{r comparative dataset, echo = FALSE, warning = FALSE}
# Format sscq data: Scotland rows only, without the "Other" measures, and with
# the same four columns as scotland_data
scotland_sscq <- sscq_data %>%
  filter(
    Council == "Scotland",
    !Measure %in% c(
      "Sex - Other (e.g. Prefer not to answer / not disclosed)",
      "Age - Other (e.g. Prefer not to answer / not disclosed)",
      "Disability - Other (e.g. Prefer not to answer / not disclosed)",
      "Ethnicity - Other (e.g. Prefer not to answer / not disclosed)",
      "Religion - Other (e.g. Prefer not to answer / not disclosed)",
      "Sexual Orientation - Other (e.g. prefer not to answer / not disclosed)",
      "Marital Status - Other (e.g. prefer not to answer / not disclosed)"
    )
  ) %>%
  rename(Percent = Percentage) %>%
  select(Characteristic, Measure, Percent, Dataset)

# Combine Scotland level employee data with SSCQ data.
# bind_rows() + ungroup() is used instead of rbind() because scotland_data may
# still carry a group_by(Characteristic) grouping, and the Characteristic
# column is rewritten below.
comparative_data <- bind_rows(scotland_data, scotland_sscq) %>%
  ungroup() %>%
  mutate(
    # Set as factor to keep order of the bars
    Dataset = factor(
      Dataset,
      levels = c(
        "Scotland - Council Employees",
        "Scotland - Adult Population (SSCQ)"
      )
    ),
    # Rename measures so they look nicer/more informative in graphs;
    # values not listed are left unchanged
    Measure = dplyr::recode(
      Measure,
      "Yes" = "Disability",
      "No" = "No Disability",
      "LGB" = "Lesbian, Gay or Bisexual",
      "Married/Civil partnership" = "Married / Civil partnership",
      "Never married - single/DK" = "Single",
      "Divorced/Dissolved civil partnership" = "Divorced / Dissolved civil partnership",
      "Widowed/Bereaved civil partner" = "Widowed / Bereaved civil partner"
    ),
    # Rename characteristics so they can be used as titles
    Characteristic = dplyr::recode(
      Characteristic,
      "Comparable Ethnicity Categories" = "Ethnicity",
      "SSCQ Age Categories" = "Age",
      "SSCQ Marital Status" = "Marital Status",
      "SSCQ Religion Categories" = "Religion",
      "SSCQ Sexual Orientation Categories" = "Sexual Orientation"
    )
  )


```

``` {r function to create Scotland graphs, echo = FALSE}
# Draw a dodged bar chart comparing the datasets for one characteristic.
#
# Args:
#   graph_characteristic: value of the Characteristic column to plot; it is
#     also used as the plot title.
#   factor_levels: Measure values in the order the bars should appear on the
#     x axis. Measures not listed here become NA on the axis.
#   data: data frame with Characteristic, Measure, Percent and Dataset
#     columns. Defaults to the comparative_data object, so existing
#     two-argument calls behave exactly as before.
#
# Returns:
#   A ggplot object.
create_scotland_graphs <- function(graph_characteristic,
                                   factor_levels,
                                   data = comparative_data) {
  filtered_data <- data %>%
    filter(Characteristic == graph_characteristic)

  ggplot(
    filtered_data,
    aes(x = factor(Measure, factor_levels), y = Percent, fill = Dataset)
  ) +
    # geom_col() is geom_bar(stat = "identity"): bar height is Percent itself
    geom_col(position = "dodge", color = "grey") +
    scale_fill_brewer(palette = "Paired", direction = -1) +
    # Data labels, dodged by the same width as the bars so they sit above them
    geom_text(
      aes(label = paste0(Percent, "%")),
      vjust = -0.5,
      color = "black",
      position = position_dodge(0.9),
      size = 2.5
    ) +
    ggtitle(graph_characteristic) +
    # Wrap long labels
    scale_x_discrete(labels = function(x) str_wrap(x, width = 12)) +
    theme_minimal() +
    # Remove axis labels and the legend title; put the legend underneath
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      legend.position = "bottom",
      legend.title = element_blank()
    )
}


```

```{r Sex Scotland, echo = FALSE, warning = FALSE}
# Sex chart, bars ordered Female then Male
sex_scotland_graph <- create_scotland_graphs("Sex", c("Female", "Male"))
sex_scotland_graph
```

```{r Age Scotland, echo = FALSE, warning = FALSE}
# Age chart, bars ordered from the youngest band to the oldest
age_scotland_graph <- create_scotland_graphs(
  "Age",
  c("16-24", "25-34", "35-44", "45-54", "55-64", "65-74", "75+")
)
age_scotland_graph
```

```{r Disability Scotland, echo = FALSE, warning = FALSE}
# Disability chart, bars ordered Disability then No Disability
disability_scotland_graph <- create_scotland_graphs(
  "Disability",
  c("Disability", "No Disability")
)
disability_scotland_graph
```


```{r Ethnicity Scotland, echo = FALSE, warning = FALSE}
# Ethnicity chart, bars ordered White then Minority Ethnicities
ethnicity_scotland_graph <- create_scotland_graphs(
  "Ethnicity",
  c("White", "Minority Ethnicities")
)
ethnicity_scotland_graph
```

```{r Religion Scotland, echo = FALSE, warning = FALSE}
# Religion chart; the vector below fixes the left-to-right order of the bars
religion_scotland_graph <- create_scotland_graphs(
  "Religion",
  c(
    "None",
    "Church of Scotland",
    "Roman Catholic",
    "Other Christian",
    "Muslim"
  )
)
religion_scotland_graph
```

```{r Sexual Orientation Scotland, echo = FALSE, warning = FALSE}
# Sexual orientation chart, bars ordered Heterosexual then LGB
sexual_orientation_scotland_graph <- create_scotland_graphs(
  "Sexual Orientation",
  c("Heterosexual", "Lesbian, Gay or Bisexual")
)

sexual_orientation_scotland_graph
```

```{r Marital Status Scotland, echo = FALSE, warning = FALSE}
# Marital status chart; the vector below fixes the order of the bars.
# NOTE(review): "Seperated" must match the Measure text in the data exactly;
# presumably the source data uses this spelling - confirm before correcting it.
marital_status_scotland_graph <- create_scotland_graphs(
  "Marital Status",
  c(
    "Married / Civil partnership",
    "Single",
    "Divorced / Dissolved civil partnership",
    "Seperated",
    "Widowed / Bereaved civil partner"
  )
)
marital_status_scotland_graph
```

``` {r workforce and population data, echo = FALSE}
# Council workforce percentages, labelled ready for the merge
council_data_formatted <- council_data_no_other %>%
  rename(`Council Workforce` = Percent) %>%
  select(-c(`Period Covered`, Date, Value))

# SSCQ population percentages, labelled ready for the merge
sscq_data_formatted <- sscq_data %>%
  rename(`Council Population` = Percentage) %>%
  select(-c(Dataset, `Period Covered`, Date))

# No `by` is given, so merge() joins on every column name the two tables share
workforce_pop_data <- merge(council_data_formatted, sscq_data_formatted)


```

```{r gender diff, echo = FALSE, warning = FALSE}
# Female percentage of each council's workforce and population, plus the gap
gender_diff_data <- workforce_pop_data %>%
  filter(Characteristic == "Sex", Measure == "Female") %>%
  mutate(Diff = `Council Workforce` - `Council Population`)

# Each extreme is found with na.rm = TRUE: without it a single missing value
# makes min()/max() return NA and the filter silently returns no rows.
pop_min <- gender_diff_data %>%
  filter(`Council Population` == min(`Council Population`, na.rm = TRUE)) %>%
  mutate(Criteria = "Population min")

pop_max <- gender_diff_data %>%
  filter(`Council Population` == max(`Council Population`, na.rm = TRUE)) %>%
  mutate(Criteria = "Population max")

workforce_min <- gender_diff_data %>%
  filter(`Council Workforce` == min(`Council Workforce`, na.rm = TRUE)) %>%
  mutate(Criteria = "Workforce min")

workforce_max <- gender_diff_data %>%
  filter(`Council Workforce` == max(`Council Workforce`, na.rm = TRUE)) %>%
  mutate(Criteria = "Workforce max")

diff_max <- gender_diff_data %>%
  filter(Diff == max(Diff, na.rm = TRUE)) %>%
  mutate(Criteria = "Difference max")

# Stack the five results into one table; tied councils each keep their own row
gender_range_summary <- bind_rows(
  pop_min,
  pop_max,
  workforce_min,
  workforce_max,
  diff_max
)

kable(gender_range_summary)
```

```{r gender range plot, echo = FALSE}
# Range (min, median, max) of the female percentage across councils, for the
# workforce and for the population
gender_range_data <- gender_diff_data %>%
  # Select the two percentage columns by name rather than by position, so a
  # change in column order cannot silently pivot the wrong columns
  pivot_longer(
    cols = all_of(c("Council Workforce", "Council Population")),
    names_to = "Dataset",
    values_to = "Value"
  ) %>%
  group_by(Dataset) %>%
  # na.rm = TRUE so one missing council value does not blank the whole range
  summarise(
    lower = min(Value, na.rm = TRUE),
    upper = max(Value, na.rm = TRUE),
    p = median(Value, na.rm = TRUE)
  )

print(
  ggplot(data = gender_range_data, aes(x = Dataset, y = p)) +
    # Point at the median, line from the minimum to the maximum
    geom_pointrange(
      mapping = aes(ymin = lower, ymax = upper),
      color = "steelblue3"
    ) +
    theme_minimal() +
    ggtitle("Female proportion range across councils") +
    # Remove axis labels
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank()
    )
)
```

```{r gender pay gap, echo = FALSE}
# Female workforce percentage per council, as a column named "Female"
gender_data <- council_data_no_other %>%
  filter(Characteristic == "Sex", Measure == "Female") %>%
  pivot_wider(names_from = Measure, values_from = Percent) %>%
  select(Council, Female)

# Spread the pay gap measures into one column per measure
pay_gap_data <- pay_gap_data %>%
  pivot_wider(names_from = Measure, values_from = Data) %>%
  select(-Year)

# No `by` is given, so merge() joins on every column name the two tables share
pay_gap_data <- merge(pay_gap_data, gender_data)

print(
  ggplot(pay_gap_data, aes(x = Female, y = `Gender Pay Gap`)) +
    geom_point() +
    # Add linear regression line. The formula is given explicitly (it is the
    # "lm" default) so geom_smooth() does not print its "using formula"
    # message into the report; this chunk does not set message = FALSE.
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      se = FALSE,
      colour = "black"
    ) +
    # Adds correlation label; data and aesthetics are inherited from ggplot()
    stat_cor(method = "pearson")
)
```

```{r most common age, echo = FALSE}
# For each council keep the age band(s) with the largest workforce percentage,
# then count how many councils each band is the largest for
common_age <- council_data_formatted %>%
  filter(Characteristic == "SSCQ Age Categories") %>%
  group_by(Council) %>%
  filter(`Council Workforce` == max(`Council Workforce`, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(Measure) %>%
  summarise(n = n())
```
```{r age diff, echo = FALSE}
# Split the age bands into "Older" (45-54, 55-64, 65-74) and "Younger" (every
# other remaining band).
# 75+ is excluded as there is such a small proportion in that group.
age_diff_data <- workforce_pop_data %>%
  filter(
    Characteristic == "SSCQ Age Categories",
    !Measure %in% c(
      "75+",
      "Age - Other (e.g. Prefer not to answer / not disclosed)"
    )
  ) %>%
  mutate(
    `Age Category` = if_else(
      Measure %in% c("45-54", "55-64", "65-74"),
      "Older",
      "Younger"
    )
  ) %>%
  # Sum the bands to get aggregate younger and older % for each council.
  # mutate() (not summarise()) is used, so every band row carries its group's
  # totals and the grouping is left in place.
  group_by(Council, `Age Category`) %>%
  mutate(
    `Workforce Sum` = sum(`Council Workforce`, na.rm = TRUE),
    `Population Sum` = sum(`Council Population`, na.rm = TRUE),
    Diff = `Workforce Sum` - `Population Sum`
  )

younger_slice <- age_diff_data %>%
  ungroup() %>%
  filter(
    `Age Category` == "Younger",
    # Remove these councils as they have missing age categories
    !Council %in% c("Moray", "North Lanarkshire", "Scottish Borders")
  )

# Group totals repeat on every band row, so slice(1) keeps a single row
max_younger_slice <- younger_slice %>%
  filter(`Workforce Sum` == max(`Workforce Sum`)) %>%
  slice(1) %>%
  mutate(Criteria = "max_younger_slice")

max_diff_younger_slice <- younger_slice %>%
  filter(Diff == max(Diff)) %>%
  slice(1) %>%
  mutate(Criteria = "max_diff_younger_slice")

older_slice <- age_diff_data %>%
  ungroup() %>%
  filter(
    `Age Category` == "Older",
    # Remove these councils as they have missing age categories
    !Council %in% c("Moray", "North Lanarkshire", "Scottish Borders")
  )

max_older_slice <- older_slice %>%
  filter(`Workforce Sum` == max(`Workforce Sum`)) %>%
  slice(1) %>%
  mutate(Criteria = "max_older_slice")

max_diff_older_slice <- older_slice %>%
  filter(Diff == max(Diff)) %>%
  slice(1) %>%
  mutate(Criteria = "max_diff_older_slice")

# One row per criterion, keeping only the columns needed for the table
age_diff_summary <- rbind(
  max_younger_slice,
  max_diff_younger_slice,
  max_older_slice,
  max_diff_older_slice
) %>%
  select(
    Criteria,
    Council,
    `Age Category`,
    `Workforce Sum`,
    `Population Sum`,
    Diff
  )

kable(age_diff_summary)
```

```{r max minorities, echo = FALSE}
# For each measure below, keep the council row(s) with the highest workforce
# percentage. The measure is filtered first and the maximum taken second, so
# the maximum is only over the rows for that measure.
ethnicity_max <- workforce_pop_data %>%
  filter(
    Characteristic == "Comparable Ethnicity Categories",
    Measure == "Minority Ethnicities"
  ) %>%
  filter(`Council Workforce` == max(`Council Workforce`, na.rm = TRUE)) %>%
  mutate(Criteria = "ethnicity_max")

disability_max <- workforce_pop_data %>%
  filter(Characteristic == "Disability", Measure == "Yes") %>%
  filter(`Council Workforce` == max(`Council Workforce`, na.rm = TRUE)) %>%
  mutate(Criteria = "disability_max")

sexuality_max <- workforce_pop_data %>%
  filter(
    Characteristic == "SSCQ Sexual Orientation Categories",
    Measure == "LGB"
  ) %>%
  filter(`Council Workforce` == max(`Council Workforce`, na.rm = TRUE)) %>%
  mutate(Criteria = "sexuality_max")

# Stack the three results into one table
max_minorities <- rbind(ethnicity_max, disability_max, sexuality_max)

kable(max_minorities)

```