Setup

Set knitting options

# global knitting options for automatic saving of all plots as .png and .pdf. Also sets cache directory.
knitr::opts_chunk$set(
  dev = c("png", "pdf"), fig.keep = "all",
  dev.args = list(pdf = list(encoding = "WinAnsi", useDingbats = FALSE)),
  fig.path = file.path("fig_output/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input()))),
  cache.path = file.path("cache/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input())))
)

library(tidyverse)

## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(rlang)

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice

library(glue)

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

library(latex2exp)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# source all relevant scripting files
source(file.path("scripts", "ampliverse_05.R"))
source(file.path("scripts", "plotting_functions.R"))

Load data

Load DADA2 output data

# taxa and sequence tables obtained from https://github.com/danote/Samail_16S_compilation
seqtab_OM19 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM19_processed_20200803.rds")
taxtab_OM19 <- read_rds("data_raw/16S_sequencing_data/taxa_OM19_processed_20200803.rds")

Load metadata

meta_map_OM19 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM19.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>% select(1:22)

Tidy up the data, concatenate taxa levels, and add metadata

ampli_data_OM19 <- ampli_tidy_dada2(seqtab_OM19, taxtab_OM19) %>% ampli_concat_tax() %>% ampli_concat_tax(tax_levels = 6, concat_tax_col_name = taxonomy_6) %>% ampli_join_metadata_map(meta_map_OM19)

Initial data examination

Read counts, full dataset

ampli_data_OM19_sum <- ampli_data_OM19 %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_data_OM19_sum %>% arrange(desc(reads_sum))

# generate summary stats of read counts
summary(ampli_data_OM19_sum %>% select(reads_sum))

##    reads_sum    
##  Min.   :   92  
##  1st Qu.:43423  
##  Median :66501  
##  Mean   :55686  
##  3rd Qu.:73624  
##  Max.   :86307

Plot read counts

Oman groundwater samples have significantly higher read counts than extraction or PCR controls, which is good.

plot_reads_sums_1 <- ampli_data_OM19_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  fill = sample_type
)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(name = "Reads") +
  scale_x_discrete(name = "Sample ID") +
  scale_fill_discrete(name = "Sample type") +
  theme_bw()+
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom"
  )

plot_reads_sums_1

# Focus on data of interest ## Filter for only desired samples

# define sample set ID's
samples_to_keep <- c("14NP", "B71")

# keep just those samples
ampli_data_OM19_focus_samples <- ampli_data_OM19 %>% ampli_filter_strings(col_to_filter = sample_id, strings_to_filter = samples_to_keep, detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads (messes up plotting later if kept)
  ampli_rm_0_read_taxa()

## 468 zero-read taxa removed from data set.

Filter out unwanted taxa

Filter out mitochondria, chloroplasts, eukaryotes, and sequences not assigned taxonomy at the the domain level

ampli_data_OM19_focus_samples_taxa_filtered <- ampli_data_OM19_focus_samples %>% ampli_filter_strings(col_to_filter = taxonomy, strings_to_filter =   c("Chloroplast", "Mitochondria", "Eukaryota", "k__NA"), detection_method = "substring", action = "remove")

Read counts, filtered dataset

Tally reads per sample

ampli_data_OM19_focus_samples_taxa_filtered_sum <- ampli_data_OM19_focus_samples_taxa_filtered %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_data_OM19_focus_samples_taxa_filtered_sum %>% arrange(desc(reads_sum))

# generate summary stats of read counts
summary(ampli_data_OM19_focus_samples_taxa_filtered_sum %>% select(reads_sum))

##    reads_sum    
##  Min.   :67256  
##  1st Qu.:68841  
##  Median :70426  
##  Mean   :70426  
##  3rd Qu.:72010  
##  Max.   :73595

Plot read counts

plot_reads_sums_2 <- ampli_data_OM19_focus_samples_taxa_filtered_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  label = reads_sum
)) +
  geom_bar(stat = "identity") +
  geom_text(nudge_y = 1200) +
  scale_x_discrete(name = "Sample ID") +
  scale_y_continuous(name = "Reads") +
  theme_bw()+
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom",
    panel.grid = element_blank()
  )

plot_reads_sums_2

Calculate relative abundances

ampli_data_OM19_focus_samples_taxa_filtered <- ampli_data_OM19_focus_samples_taxa_filtered %>% ampli_calc_rel_abund()

ampli_data_OM19_focus_samples_taxa_filtered %>% head()

Check that relative abundances add up to 1, as expected

ampli_data_OM19_focus_samples_taxa_filtered %>% group_by(sample_id) %>% summarise(rel_abund_sum = sum(rel_abund), .groups = "drop") %>%  summary()

##   sample_id         rel_abund_sum
##  Length:2           Min.   :1    
##  Class :character   1st Qu.:1    
##  Mode  :character   Median :1    
##                     Mean   :1    
##                     3rd Qu.:1    
##                     Max.   :1

Heat map, full dataset

OM19_heat <- ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 10, y_taxa_arrangement = "abund")

OM19_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 2.8) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance /\n[%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8.124) +

  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    legend.box.spacing = unit(2, "pt"),
    legend.title = element_text(size = 7),
    plot.margin = margin(2,0,1,1)
        )

OM19_heat_tax_6 <- ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 7, y_taxa_arrangement = "abund", y_taxa_col = taxonomy_6)

OM19_heat_tax_6 +
  # plot geometry
  geom_text(parse = FALSE, size = 2.8) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance /\n[%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 9) +

  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    legend.box.spacing = unit(2, "pt"),
    legend.title = element_text(size = 7),
    plot.margin = margin(2,0,1,1)
        )

OM19_heat_tax_6_names_tbl <- ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 7, y_taxa_arrangement = "abund", y_taxa_col = taxonomy_6, return_plot_data_tbl = TRUE)

OM19_heat_tax_6_names_tbl

OM19_heat_tax_6_names_tbl_edit <- OM19_heat_tax_6_names_tbl  %>% mutate(tax_short = case_when(
  taxonomy_6 == "k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobacterium" ~ "g. $\\textit{Methanobacterium}$",
  taxonomy_6 == "k__Bacteria; p__Acetothermia; c__Acetothermiia; o__NA; f__NA; g__NA" ~ "c. Acetothermiia",
  taxonomy_6 == "k__Bacteria; p__Deinococcota; c__Deinococci; o__Thermales; f__Thermaceae; g__Meiothermus" ~ "g. $\\textit{Meiothermus}$",
  taxonomy_6 == "k__Bacteria; p__Nitrospirota; c__Thermodesulfovibrionia; o__NA; f__NA; g__NA" ~ "c. Thermodesulfovibrionia",
  taxonomy_6 == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Brachymonas" ~ "g. $\\textit{Brachymonas}$",
  taxonomy_6 == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Hydrogenophaga" ~ "g. $\\textit{Hydrogenophaga}$",
  taxonomy_6 == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Silanimonas" ~ "g. $\\textit{Silanimonas}$",
  taxonomy_6 == "Other taxa" ~ "Other taxa"
))

OM19_heat_tax_6_names_tbl_edit

OM19_heat_tax_6 <- ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 7, y_taxa_arrangement = "abund", y_taxa_col = taxonomy_6, custom_taxa_names_tbl = OM19_heat_tax_6_names_tbl_edit, custom_taxa_names_col = tax_short,)

OM19_heat_tax_6 +
  # plot geometry
  geom_text(parse = FALSE) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance /\n[%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Well", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment\n(maximum of genus)", expand = c(0,0), labels = TeX) +

  theme_bw(base_size = 10) +

  theme(
    legend.position = "right",
    panel.grid = element_blank(),
    legend.box.spacing = unit(0, "pt"),
    legend.box.margin = margin(1,1,1,1),
    plot.margin = margin(2,-5,1,1),
    legend.title = element_text(size = 9)
        )

OM19 CH4 taxa

CH4_taxa <- c("Methanobacteri", "Methanomicrobia", "Methanopyrales", "Methanocellales", "Methanoplasmatales", "Methanosarcinales", "Methanomassiliicocc", "Methylococc", "Methylocystis", "Methylosinus", "Methylocella", "Methylocapsa", "Methylacidiphil", "Methylomirabilis", "ANME")

ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 1, text_label_format = "scientific") +
  # plot geometry
  geom_text(parse = TRUE, size = 3) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "right",
    panel.grid = element_blank()
        )

## Warning: `as_character()` is deprecated as of rlang 0.4.0
## Please use `vctrs::vec_cast()` instead.
## This warning is displayed once per session.

ampli_data_OM19_focus_samples_taxa_filtered %>%
  ampli_heat_map(y_taxa_col = Genus,  x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round") +
  # plot geometry
  geom_text(parse = FALSE, size = 3) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "right",
    panel.grid = element_blank()
        )

Oman 2019 16S rRNA gene amplicon sequencing data processing

Source file: OM19_16S_processing.Rmd

Daniel Nothaft

2020-09-18