Setup

Set knitting options

# global knitting options for automatic saving of all plots as .png and .pdf. Also sets cache directory.
knitr::opts_chunk$set(
  dev = c("png", "pdf"), fig.keep = "all",
  dev.args = list(pdf = list(encoding = "WinAnsi", useDingbats = FALSE)),
  fig.path = file.path("fig_output/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input()))),
  cache.path = file.path("cache/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input())))
)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(rlang)

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice

library(glue)

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

library(latex2exp)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

# source all relevant scripting files
source(file.path("scripts", "ampliverse_07.R"))
source(file.path("scripts", "plotting_functions.R"))

Load data

Load DADA2 output data

# taxa and sequence tables obtained from https://github.com/danote/Samail_16S_compilation
seqtab_OM17 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM17_processed_20200929_3.rds")
taxtab_OM17 <- read_rds("data_raw/16S_sequencing_data/taxa_OM17_processed_20200929_2.rds")

seqtab_OM18 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM18_processed_20200929.rds")
taxtab_OM18 <- read_rds("data_raw/16S_sequencing_data/taxa_OM18_processed_20200929.rds")

seqtab_OM19 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM19_processed_20200803.rds")
taxtab_OM19 <- read_rds("data_raw/16S_sequencing_data/taxa_OM19_processed_20200803.rds")

Load metadata

meta_map_OM17 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM17.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>%
  # select columns 1 through 22 because read_delim is generating some extra dummy columns (X23, X24, etc.)
  # (there are only 22 actual data columns in the spreadsheet)
  select(1:22)

## New names:
## * `` -> ...23
## * `` -> ...24
## * `` -> ...25
## * `` -> ...26
## * `` -> ...27
## * ...

meta_map_OM18 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM18.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>% select(1:22)

meta_map_OM19 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM19.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>% select(1:22)

Tidy up the data, concatenate taxa levels, and add metadata

ampli_data_OM17 <- ampli_tidy_dada2(seqtab_OM17, taxtab_OM17) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM17)

ampli_OM17_thru_19 <- ampli_tidy_dada2(seqtab_OM18, taxtab_OM18) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM18)

ampli_data_OM19 <- ampli_tidy_dada2(seqtab_OM19, taxtab_OM19) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM19)

Combine datasets.

ampli_OM17_thru_19 <- ampli_join_ampli_tibbles(lst(ampli_data_OM17, ampli_OM17_thru_19, ampli_data_OM19))

## Joining, by = c("sample_id", "sequence", "reads", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "taxonomy", "barcode_sequence", "forward_linker_primer_sequence", "reverse_primer_sequence", "sample_type", "nucleic_acid_type", "sampling_site", "year_sampled", "month_sampled", "day_sampled", "depth_fluid_intake_mbct", "notes", "sampling_method", "upper_packer_inflated", "upper_packer_depth_mbct", "lower_packer_inflated", "lower_packer_depth_mbct", "well_depth_mbgl", "casing_extent_mbct", "casing_height_magl", "screened_interval_mbct", "depth_to_water_mbct")
## Joining, by = c("sample_id", "sequence", "reads", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "taxonomy", "barcode_sequence", "forward_linker_primer_sequence", "reverse_primer_sequence", "sample_type", "nucleic_acid_type", "sampling_site", "year_sampled", "month_sampled", "day_sampled", "depth_fluid_intake_mbct", "notes", "sampling_method", "upper_packer_inflated", "upper_packer_depth_mbct", "lower_packer_inflated", "lower_packer_depth_mbct", "well_depth_mbgl", "casing_extent_mbct", "casing_height_magl", "screened_interval_mbct", "depth_to_water_mbct")

Initial data examination

Read counts, full dataset

ampli_OM17_thru_19_sum <- ampli_OM17_thru_19 %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_OM17_thru_19_sum %>% arrange(desc(reads_sum))

# generate summary stats of read counts
summary(ampli_OM17_thru_19_sum %>% select(reads_sum))

##    reads_sum      
##  Min.   :    0.0  
##  1st Qu.:  903.2  
##  Median : 5864.0  
##  Mean   :11758.8  
##  3rd Qu.:15034.8  
##  Max.   :86307.0

Plot read counts

Oman groundwater samples have significantly higher read counts than extraction or PCR controls, which is good.

plot_reads_sums_1 <- ampli_OM17_thru_19_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  fill = sample_type
)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(name = "Reads") +
  scale_x_discrete(name = "Sample ID") +
  scale_fill_discrete(name = "Sample type") +
  theme_bw(base_size = 7)+
  facet_grid(cols = vars(year_sampled), scales = "free", space = "free") +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom"
  )

plot_reads_sums_1

Focus on data of interest

Filter for only desired samples

# keep just those samples
ampli_OM17_thru_19_focus_samples <- ampli_OM17_thru_19 %>% ampli_filter_strings(col_to_filter = sampling_site, strings_to_filter = c("BA1A", "BA1D"), detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads (messes up plotting later if kept)
  ampli_rm_0_read_taxa()

## 7961 zero-read taxa removed from data set.

Filter out unwanted taxa

Filter out mitochondria, chloroplasts, eukaryotes, and sequences not assigned taxonomy at the the domain level

ampli_OM17_thru_19_focus_samples_taxa_filtered <- ampli_OM17_thru_19_focus_samples %>% ampli_filter_strings(col_to_filter = taxonomy, strings_to_filter =   c("Chloroplast", "Mitochondria", "Eukaryota", "k__NA"), detection_method = "substring", action = "remove")

Read counts, filtered dataset

Tally reads per sample

ampli_OM17_thru_19_focus_samples_taxa_filtered_sum <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% arrange(desc(reads_sum))

# generate summary stats of read counts
summary(ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% select(reads_sum))

##    reads_sum    
##  Min.   :  106  
##  1st Qu.: 1701  
##  Median :25811  
##  Mean   :25583  
##  3rd Qu.:37294  
##  Max.   :86183

Plot read counts

plot_reads_sums_2 <- ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  label = reads_sum
)) +
  geom_bar(stat = "identity") +
  geom_text(nudge_y = 2500, size = 2.5) +
  scale_y_continuous(name = "Reads") +
  scale_x_discrete(name = "Sample ID") +
  scale_fill_discrete(name = "Sample type") +
  theme_bw(base_size = 11)+
  facet_grid(cols = vars(year_sampled), scales = "free", space = "free") +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom"
  )

plot_reads_sums_2

Calculate relative abundances

ampli_OM17_thru_19_focus_samples_taxa_filtered <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_calc_rel_abund()

ampli_OM17_thru_19_focus_samples_taxa_filtered %>% head()

Check that relative abundances add up to 1, as expected

ampli_OM17_thru_19_focus_samples_taxa_filtered %>% group_by(sample_id) %>% summarise(rel_abund_sum = sum(rel_abund), .groups = "drop") %>%  summary()

##   sample_id         rel_abund_sum
##  Length:21          Min.   :1    
##  Class :character   1st Qu.:1    
##  Mode  :character   Median :1    
##                     Mean   :1    
##                     3rd Qu.:1    
##                     Max.   :1

Heat map, full dataset

OM17_thru_19_heat <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sample_id, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampled)

OM17_thru_19_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 2.25) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.margin =  margin(1,1,1,1)
        )

BA1 pumped samples with 0.2 µm filter pore diameter

Filter for only desired samples

# keep just those samples
ampli_packer_main_text_taxa_filtered <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_filter_strings(col_to_filter = sample_id, strings_to_filter = c("BA1A100_22_2D", "BA1A5566_22_1C", "A108", "A30", "A41", "D102", "D45"), detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads (messes up plotting later if kept)
  ampli_rm_0_read_taxa()

## 219 zero-read taxa removed from data set.

# if packers were used, generate names for packed intervals
# in this code, the packer sample sites have their intervals separated by a new line character. This is for plotting clarity.
ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(site_packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, sampling_site, paste0(sampling_site, "\n", if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))))

ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(year_sampling_site = paste(year_sampled, sampling_site))

ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))))

add factor levels for plot organization

ampli_packer_main_text_taxa_filtered$packer_interval <- factor(ampli_packer_main_text_taxa_filtered$packer_interval, levels = c(
  "0-30",
  "41-65",
  "45-75",
  "55-66",
  "100-400",
  "102-132",
  "108-132"
))

Overall taxonomy

Full names

packer_main_text_heat <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 25, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site)

packer_main_text_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] grouped by\nyear of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 8.5) +

  theme(
    panel.grid = element_blank(),
    legend.position = "bottom",
    plot.margin =  margin(1,1,1,1)
        )

Shortened names

packer_main_text_heat_names_tbl <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 23, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site, return_plot_data_tbl = TRUE)

packer_main_text_heat_names_tbl

packer_main_text_heat_names_taxonomy_vector <- packer_main_text_heat_names_tbl %>%  group_by(taxonomy) %>% summarise(taxonomy = first(taxonomy)) %>% mutate(taxonomy_char = as.character(taxonomy)) %>% pull(taxonomy_char)

packer_main_text_heat_names_taxonomy_vector

##  [1] "Other taxa"                                                                                                                        
##  [2] "k__Bacteria; p__Firmicutes; c__Dethiobacteria; o__Dethiobacterales; f__Dethiobacteraceae; g__Dethiobacter; s__NA"                  
##  [3] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Hydrogenophilaceae; g__Thiobacillus; s__thioparus"  
##  [4] "k__Bacteria; p__Bacteroidota; c__Kryptonia; o__Kryptoniales; f__BSV26; g__NA; s__NA"                                               
##  [5] "k__Bacteria; p__Patescibacteria; c__ABY1; o__Candidatus_Magasanikbacteria; f__NA; g__NA; s__NA"                                    
##  [6] "k__Bacteria; p__Acetothermia; c__Acetothermiia; o__NA; f__NA; g__NA; s__NA"                                                        
##  [7] "k__Bacteria; p__Nitrospirota; c__Nitrospiria; o__Nitrospirales; f__Nitrospiraceae; g__Nitrospira; s__NA"                           
##  [8] "k__Bacteria; p__Patescibacteria; c__Parcubacteria; o__NA; f__NA; g__NA; s__NA"                                                     
##  [9] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Azohydromonas; s__NA"            
## [10] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__NA; s__NA"                     
## [11] "k__Bacteria; p__Patescibacteria; c__CPR2; o__NA; f__NA; g__NA; s__NA"                                                              
## [12] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Silanimonas; s__NA"            
## [13] "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus_Nitrososphaera; s__NA"
## [14] "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Opitutales; f__Opitutaceae; g__Lacunisphaera; s__NA"                    
## [15] "k__Bacteria; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA"                                                                             
## [16] "k__Bacteria; p__Bacteroidota; c__Ignavibacteria; o__Ignavibacteriales; f__SM1H02; g__NA; s__NA"                                    
## [17] "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Candidatus_Nitrosotenuis; s__NA"   
## [18] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Sutterellaceae; g__NA; s__NA"                       
## [19] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__NA; f__NA; g__NA; s__NA"                                                
## [20] "k__Bacteria; p__GAL15; c__NA; o__NA; f__NA; g__NA; s__NA"                                                                          
## [21] "k__Bacteria; p__Deinococcota; c__Deinococci; o__Thermales; f__Thermaceae; g__Meiothermus; s__NA"                                   
## [22] "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Parvibaculales; f__Parvibaculaceae; g__Parvibaculum; s__lavamentivorans"
## [23] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Brachymonas; s__NA"              
## [24] "k__Bacteria; p__Nitrospirota; c__Thermodesulfovibrionia; o__NA; f__NA; g__NA; s__NA"

ampli_packer_main_text_taxa_filtered %>% ampli_filter_strings(col_to_filter = taxonomy, strings_to_filter = packer_main_text_heat_names_taxonomy_vector, detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads
  ampli_rm_0_read_taxa() %>% group_by(taxonomy) %>% summarise(Kingdom = first(Kingdom), Phylum = first(Phylum), Class = first(Class), Order = first(Order), Family = first(Family), Genus = first(Genus), Species = first(Species))

## 1 zero-read taxa removed from data set.

packer_main_text_heat_names_tbl_edit <- packer_main_text_heat_names_tbl  %>% mutate(tax_short = case_when(
  taxonomy == "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Candidatus_Nitrosotenuis; s__NA" ~ "g. \\textit{Ca.} Nitrosotenuis",
  taxonomy == "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus_Nitrososphaera; s__NA" ~ "g. \\textit{Ca.} Nitrososphaera",
  taxonomy == "k__Bacteria; p__Acetothermia; c__Acetothermiia; o__NA; f__NA; g__NA; s__NA" ~ "c. Acetothermiia",
  taxonomy == "k__Bacteria; p__Bacteroidota; c__Ignavibacteria; o__Ignavibacteriales; f__SM1H02; g__NA; s__NA" ~ "f. SM1H02",
  taxonomy == "k__Bacteria; p__Bacteroidota; c__Kryptonia; o__Kryptoniales; f__BSV26; g__NA; s__NA" ~ "f. BSV26",
  taxonomy == "k__Bacteria; p__Chloroflexi; c__P2-11E; o__NA; f__NA; g__NA; s__NA" ~ "c. P2-11E",
  taxonomy == "k__Bacteria; p__Deinococcota; c__Deinococci; o__Thermales; f__Thermaceae; g__Meiothermus; s__NA" ~ "g. \\textit{Meiothermus}",
  taxonomy == "k__Bacteria; p__Firmicutes; c__Dethiobacteria; o__Dethiobacterales; f__Dethiobacteraceae; g__Dethiobacter; s__NA" ~ "g. \\textit{Dethiobacter}",
  taxonomy == "k__Bacteria; p__GAL15; c__NA; o__NA; f__NA; g__NA; s__NA" ~ "p. GAL15",
  taxonomy == "k__Bacteria; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA" ~ "k. Bacteria",
  taxonomy == "k__Bacteria; p__Nitrospirota; c__Nitrospiria; o__Nitrospirales; f__Nitrospiraceae; g__Nitrospira; s__NA" ~ "g. \\textit{Nitrospira}",
  taxonomy == "k__Bacteria; p__Nitrospirota; c__Thermodesulfovibrionia; o__NA; f__NA; g__NA; s__NA" ~ "c. Thermodesulfovibrionia",
  taxonomy == "k__Bacteria; p__Patescibacteria; c__ABY1; o__Candidatus_Magasanikbacteria; f__NA; g__NA; s__NA" ~ "o. \\textit{Ca.} Magasanikbacteria",  
  taxonomy == "k__Bacteria; p__Patescibacteria; c__CPR2; o__NA; f__NA; g__NA; s__NA" ~ "c. CPR2",
  taxonomy == "k__Bacteria; p__Patescibacteria; c__Parcubacteria; o__NA; f__NA; g__NA; s__NA" ~ "c. Parcubacteria",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Parvibaculales; f__Parvibaculaceae; g__Parvibaculum; s__lavamentivorans" ~ "s. \\textit{lavamentivorans}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Azohydromonas; s__NA" ~ "g. \\textit{Azohydromonas}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Brachymonas; s__NA" ~ "g. \\textit{Brachymonas}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Hydrogenophilaceae; g__Thiobacillus; s__thioparus" ~ "s. \\textit{thioparus}",  
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Sutterellaceae; g__NA; s__NA" ~ "f. Sutterellaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__NA; f__NA; g__NA; s__NA" ~ "c. Gammaproteobacteria",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__NA; s__NA" ~ "f. Xanthomonadaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Silanimonas; s__NA" ~ "g. \\textit{Silanimonas}",
  taxonomy == "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Opitutales; f__Opitutaceae; g__Lacunisphaera; s__NA" ~ "g. \\textit{Lacunisphaera}",
  taxonomy == "Other taxa" ~ "Other taxa"))

packer_main_text_heat_names_tbl_edit

packer_main_text_heat <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 23, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site, custom_taxa_names_tbl = packer_main_text_heat_names_tbl_edit, custom_taxa_names_col = tax_short)

packer_main_text_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 3) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] grouped by\nyear of sampling and well name", expand = c(0,0)) +
    scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0), labels = TeX) +

  theme_bw(base_size = 11) +

  theme(
    panel.grid = element_blank(),
    # legend.position = "bottom",
    plot.margin =  margin(1,1,1,1)
        )

2018 BA1A filter size fractions

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled == 2018) %>% 
  
  # add column for packer interval
mutate(packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct)))) %>% 
         # add column for filter pore diameter
mutate(filter_diam = 
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "0.45",
  TRUE ~ "0.22"
)) %>% 
  
  ampli_heat_map(x_sample_group_col = filter_diam, plot_other_taxa_bin = TRUE, top_n = 20,  y_taxa_arrangement = "abund", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", facet_grid_sample_group_col = packer_interval) +

  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Filter pore diameter / [µm] grouped by\ndepth of sampling interval / [m] ", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 9) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

CH\(_4\) taxa

CH4_taxa <- c("Methanobacteria", "Methanomicrobia", "Methanopyrales", "Methanocellales", "Methanoplasmatales", "Methanosarcinales", "Methanomassiliicocc", "Methylococc", "Methylocystis", "Methylosinus", "Methylocella", "Methylocapsa", "Methylacidiphil", "Methylomirabilis", "ANME")

CH4_plotted_taxa_names_tbl <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled > 2017) %>% 
# add column combining year and well
mutate(year_sampling_site = paste(year_sampled, sampling_site)) %>% 
# add column combining site, packer_interval, and filter pore diamter
mutate(packer_interval_filter_diam = paste(if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))),# "\n0.22 µm")
                                          
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "\n0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "\n0.45",
  TRUE ~ "\n0.22"
))
) %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, facet_grid_sample_group_col = year_sampling_site, return_plot_data_tbl = TRUE)

## Warning: `as_character()` is deprecated as of rlang 0.4.0
## Please use `vctrs::vec_cast()` instead.
## This warning is displayed once per session.

CH4_plotted_taxa_names_tbl

CH4_plotted_taxa_names_tbl_edit <- CH4_plotted_taxa_names_tbl  %>% mutate(tax_short = case_when(
  taxonomy == "k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobacterium; s__NA" ~ "g. $\\textit{Methanobacterium}$",
  taxonomy == "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Methylacidiphilales; f__Methylacidiphilaceae; g__NA; s__NA" ~ "f. Methylacidiphilaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Methylococcales; f__Methylococcaceae; g__Methylocaldum; s__NA" ~ "g. \\textit{Methylocaldum}"
))

CH4_plotted_taxa_names_tbl_edit

CH4_plotted_taxa_names_tbl_edit %>% group_by(year_sampling_site, packer_interval_filter_diam) %>% summarise()

## `summarise()` has grouped output by 'year_sampling_site'. You can override using the `.groups` argument.

ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled > 2017) %>% 
# add column combining year and well
mutate(year_sampling_site = paste(year_sampled, sampling_site)) %>% 
# add column combining site, packer_interval, and filter pore diameter
mutate(packer_interval_filter_diam = paste(if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))),# "\n0.22 µm")
                                          
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "\n0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "\n0.45",
  TRUE ~ "\n0.22"
))
)

# sort levels for plotting
ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam$packer_interval_filter_diam <- factor(ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam$packer_interval_filter_diam, levels = c(
  "55-66 \n0.10",
  "55-66 \n0.22",
  "55-66 \n0.45",
  "100-400 \n0.10",
  "100-400 \n0.22",
  "100-400 \n0.45",
  "0-30 \n0.22",
  "41-65 \n0.22",
  "108-132 \n0.22",
  "45-75 \n0.22",
  "102-132 \n0.22"
))

ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam  %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, facet_grid_sample_group_col = year_sampling_site, custom_taxa_names_tbl = CH4_plotted_taxa_names_tbl_edit, custom_taxa_names_col = tax_short) +

  # plot geometry
  geom_text(parse = FALSE) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "white", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] and filter pore diameter / [µm]\ngrouped by year of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest\ntaxonomic assignment", expand = c(0,0), labels = TeX) +

  theme_bw(base_size = 10.5) +

  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

\(\text{C}_{2}\)-\(\text{C}_{6}\) Short-Chain Alkane (SCA)-consuming taxa

# input SCA-consuming taxa from Singh et al. (2017), Laso-Pérez et al. (2019), and Shennan et al. (2006)

all_SCA_taxa_oman <- c("BuS5", "Syntrophoarchaeum", "HotSeep-1", "Argoarchaeum", "ethanivorans", "Methanoliparia", "GoM", "Actinomyces", "Arthrobacter", "Brevibacterium", "Corynebacterium", "Gordonia", "Mycobacterium", "Nocardia", "Nocardiodes", "Rhodococcus", "Acinetobacter", "cepacia", "Pseudomonas", "Ralstonia")

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = all_SCA_taxa_oman, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 1, text_label_format = "scientific") +
  # plot geometry
  geom_text(parse = TRUE, size = 2.1) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 6.5) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank()
        )

There were no species-level assignments to known short-chain alkane oxidizing taxa. The genera shown above contain organisms capable of many metabolic processes unrelated to short-chain alkane oxidation, so their presence is not strong evidence for microbial short-chain alkane oxidation.

Hydrogenophaga

hydrogenophaga_taxa_oman <- c("Hydrogenophaga")

ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = hydrogenophaga_taxa_oman, plot_other_taxa_bin = FALSE, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 50, facet_grid_sample_group_col = year_sampling_site, y_taxa_col = Genus) +

  # plot geometry
  geom_text(parse = FALSE) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "white", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] and filter pore diameter / [µm]\ngrouped by year of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 11) +

  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
        )

## Warning: Unknown levels in `f`: Other taxa

2017 drill foam samples

All taxa

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # filter for drill foam samples
  filter(sample_type == "drill foam emerged from well") %>% 
  ampli_heat_map(x_sample_group_col = sample_id, plot_other_taxa_bin = TRUE, top_n = 20,  y_taxa_arrangement = "abund", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", facet_grid_sample_group_col = nucleic_acid_type) +

  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Replicate grouped by nucleic acid", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

Brachymonas and Parvibaculum

Brachymonas and Parvibaculum were not detected. I also included Pseudomonas in the search here just so that the plot would still have an output.

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # filter for drill foam samples
  filter(sample_type == "drill foam emerged from well") %>%
  ampli_heat_map(x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = c("Brachymonas", "Parvibaculum", "Pseudomonas"), plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 1, text_label_format = "scientific") +
  # plot geometry
  geom_text(parse = TRUE, size = 2.1) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank()
        )

BA1 16S rRNA gene amplicon sequencing data processing

Source file: BA1_16S_processing.Rmd

Daniel Nothaft

2021-07-21

Setup

Load data

Tidy up the data, concatenate taxa levels, and add metadata

Combine datasets.

Initial data examination

Read counts, full dataset

Focus on data of interest

Filter for only desired samples

Filter out unwanted taxa

Read counts, filtered dataset

Calculate relative abundances

Heat map, full dataset

BA1 pumped samples with 0.2 µm filter pore diameter

Filter for only desired samples

Overall taxonomy

Full names

Shortened names

2018 BA1A filter size fractions

CH\(_4\) taxa

\(\text{C}_{2}\)-\(\text{C}_{6}\) Short-Chain Alkane (SCA)-consuming taxa

Hydrogenophaga

2017 drill foam samples

All taxa

Brachymonas and Parvibaculum