Setup

Set knitting options

# global knitting options for automatic saving of all plots as .png and .pdf. Also sets cache directory.
knitr::opts_chunk$set(
  dev = c("png", "pdf"), fig.keep = "all",
  dev.args = list(pdf = list(encoding = "WinAnsi", useDingbats = FALSE)),
  fig.path = file.path("fig_output/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input()))),
  cache.path = file.path("cache/", paste0(gsub("\\.[Rr]md", "/", knitr::current_input())))
)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(latex2exp)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# source all relevant scripting files
source(file.path("scripts", "ampliverse_07.R"))
source(file.path("scripts", "plotting_functions.R"))

Load data

Load DADA2 output data

# taxa and sequence tables obtained from https://github.com/danote/Samail_16S_compilation
seqtab_OM17 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM17_processed_20200929_3.rds")
taxtab_OM17 <- read_rds("data_raw/16S_sequencing_data/taxa_OM17_processed_20200929_2.rds")

seqtab_OM18 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM18_processed_20200929.rds")
taxtab_OM18 <- read_rds("data_raw/16S_sequencing_data/taxa_OM18_processed_20200929.rds")

seqtab_OM19 <- read_rds("data_raw/16S_sequencing_data/seqtab_nochim_OM19_processed_20200803.rds")
taxtab_OM19 <- read_rds("data_raw/16S_sequencing_data/taxa_OM19_processed_20200803.rds")

Load metadata

meta_map_OM17 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM17.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>%
  # select columns 1 through 22 because read_delim is generating some extra dummy columns (X23, X24, etc.)
  # (there are only 22 actual data columns in the spreadsheet)
  select(1:22)
## New names:
## * `` -> ...23
## * `` -> ...24
## * `` -> ...25
## * `` -> ...26
## * `` -> ...27
## * ...
meta_map_OM18 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM18.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>% select(1:22)

meta_map_OM19 <- read_delim("data_raw/16S_sequencing_data/map_for_compilation_OM19.txt", delim = "\t",
                            col_types = cols(
  sample_id = col_character(),
  barcode_sequence = col_character(),
  forward_linker_primer_sequence = col_character(),
  reverse_primer_sequence = col_character(),
  sample_type = col_character(),
  nucleic_acid_type = col_character(),
  sampling_site = col_character(),
  year_sampled = col_double(),
  month_sampled = col_double(),
  day_sampled = col_double(),
  depth_fluid_intake_mbct = col_double(),
  notes = col_character(),
  sampling_method = col_character(),
  upper_packer_inflated = col_logical(),
  upper_packer_depth_mbct = col_double(),
  lower_packer_inflated = col_logical(),
  lower_packer_depth_mbct = col_double(),
  well_depth_mbgl = col_double(),
  casing_extent_mbct = col_double(),
  casing_height_magl = col_double(),
  screened_interval_mbct = col_character(),
  depth_to_water_mbct = col_double()
)
                            ) %>% select(1:22)

Tidy up the data, concatenate taxa levels, and add metadata

ampli_data_OM17 <- ampli_tidy_dada2(seqtab_OM17, taxtab_OM17) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM17)

ampli_OM17_thru_19 <- ampli_tidy_dada2(seqtab_OM18, taxtab_OM18) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM18)

ampli_data_OM19 <- ampli_tidy_dada2(seqtab_OM19, taxtab_OM19) %>% ampli_concat_tax() %>% ampli_join_metadata_map(meta_map_OM19)

Combine datasets.

ampli_OM17_thru_19 <- ampli_join_ampli_tibbles(lst(ampli_data_OM17, ampli_OM17_thru_19, ampli_data_OM19))
## Joining, by = c("sample_id", "sequence", "reads", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "taxonomy", "barcode_sequence", "forward_linker_primer_sequence", "reverse_primer_sequence", "sample_type", "nucleic_acid_type", "sampling_site", "year_sampled", "month_sampled", "day_sampled", "depth_fluid_intake_mbct", "notes", "sampling_method", "upper_packer_inflated", "upper_packer_depth_mbct", "lower_packer_inflated", "lower_packer_depth_mbct", "well_depth_mbgl", "casing_extent_mbct", "casing_height_magl", "screened_interval_mbct", "depth_to_water_mbct")
## Joining, by = c("sample_id", "sequence", "reads", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "taxonomy", "barcode_sequence", "forward_linker_primer_sequence", "reverse_primer_sequence", "sample_type", "nucleic_acid_type", "sampling_site", "year_sampled", "month_sampled", "day_sampled", "depth_fluid_intake_mbct", "notes", "sampling_method", "upper_packer_inflated", "upper_packer_depth_mbct", "lower_packer_inflated", "lower_packer_depth_mbct", "well_depth_mbgl", "casing_extent_mbct", "casing_height_magl", "screened_interval_mbct", "depth_to_water_mbct")

Initial data examination

Read counts, full dataset

ampli_OM17_thru_19_sum <- ampli_OM17_thru_19 %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_OM17_thru_19_sum %>% arrange(desc(reads_sum))
# generate summary stats of read counts
summary(ampli_OM17_thru_19_sum %>% select(reads_sum))
##    reads_sum      
##  Min.   :    0.0  
##  1st Qu.:  903.2  
##  Median : 5864.0  
##  Mean   :11758.8  
##  3rd Qu.:15034.8  
##  Max.   :86307.0

Plot read counts

Oman groundwater samples have significantly higher read counts than extraction or PCR controls, which is good.

plot_reads_sums_1 <- ampli_OM17_thru_19_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  fill = sample_type
)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(name = "Reads") +
  scale_x_discrete(name = "Sample ID") +
  scale_fill_discrete(name = "Sample type") +
  theme_bw(base_size = 7)+
  facet_grid(cols = vars(year_sampled), scales = "free", space = "free") +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom"
  )

plot_reads_sums_1

Focus on data of interest

Filter for only desired samples

# keep just those samples
ampli_OM17_thru_19_focus_samples <- ampli_OM17_thru_19 %>% ampli_filter_strings(col_to_filter = sampling_site, strings_to_filter = c("BA1A", "BA1D"), detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads (messes up plotting later if kept)
  ampli_rm_0_read_taxa()
## 7961 zero-read taxa removed from data set.

Filter out unwanted taxa

Filter out mitochondria, chloroplasts, eukaryotes, and sequences not assigned taxonomy at the the domain level

ampli_OM17_thru_19_focus_samples_taxa_filtered <- ampli_OM17_thru_19_focus_samples %>% ampli_filter_strings(col_to_filter = taxonomy, strings_to_filter =   c("Chloroplast", "Mitochondria", "Eukaryota", "k__NA"), detection_method = "substring", action = "remove")

Read counts, filtered dataset

Tally reads per sample

ampli_OM17_thru_19_focus_samples_taxa_filtered_sum <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_tally_reads(c("year_sampled","sample_type"))

# sort by read counts
ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% arrange(desc(reads_sum))
# generate summary stats of read counts
summary(ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% select(reads_sum))
##    reads_sum    
##  Min.   :  106  
##  1st Qu.: 1701  
##  Median :25811  
##  Mean   :25583  
##  3rd Qu.:37294  
##  Max.   :86183

Plot read counts

plot_reads_sums_2 <- ampli_OM17_thru_19_focus_samples_taxa_filtered_sum %>% ggplot(aes(
  x = fct_reorder(sample_id, desc(reads_sum)),
  y = reads_sum,
  label = reads_sum
)) +
  geom_bar(stat = "identity") +
  geom_text(nudge_y = 2500, size = 2.5) +
  scale_y_continuous(name = "Reads") +
  scale_x_discrete(name = "Sample ID") +
  scale_fill_discrete(name = "Sample type") +
  theme_bw(base_size = 11)+
  facet_grid(cols = vars(year_sampled), scales = "free", space = "free") +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    legend.position = "bottom"
  )

plot_reads_sums_2

Calculate relative abundances

ampli_OM17_thru_19_focus_samples_taxa_filtered <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_calc_rel_abund()

ampli_OM17_thru_19_focus_samples_taxa_filtered %>% head()

Check that relative abundances add up to 1, as expected

ampli_OM17_thru_19_focus_samples_taxa_filtered %>% group_by(sample_id) %>% summarise(rel_abund_sum = sum(rel_abund), .groups = "drop") %>%  summary()
##   sample_id         rel_abund_sum
##  Length:21          Min.   :1    
##  Class :character   1st Qu.:1    
##  Mode  :character   Median :1    
##                     Mean   :1    
##                     3rd Qu.:1    
##                     Max.   :1

Heat map, full dataset

OM17_thru_19_heat <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sample_id, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampled)

OM17_thru_19_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 2.25) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.margin =  margin(1,1,1,1)
        )

BA1 pumped samples with 0.2 µm filter pore diameter

Filter for only desired samples

# keep just those samples
ampli_packer_main_text_taxa_filtered <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>% ampli_filter_strings(col_to_filter = sample_id, strings_to_filter = c("BA1A100_22_2D", "BA1A5566_22_1C", "A108", "A30", "A41", "D102", "D45"), detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads (messes up plotting later if kept)
  ampli_rm_0_read_taxa()
## 219 zero-read taxa removed from data set.
# if packers were used, generate names for packed intervals
# in this code, the packer sample sites have their intervals separated by a new line character. This is for plotting clarity.
ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(site_packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, sampling_site, paste0(sampling_site, "\n", if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))))

ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(year_sampling_site = paste(year_sampled, sampling_site))

ampli_packer_main_text_taxa_filtered <- ampli_packer_main_text_taxa_filtered %>% mutate(packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))))

add factor levels for plot organization

ampli_packer_main_text_taxa_filtered$packer_interval <- factor(ampli_packer_main_text_taxa_filtered$packer_interval, levels = c(
  "0-30",
  "41-65",
  "45-75",
  "55-66",
  "100-400",
  "102-132",
  "108-132"
))

Overall taxonomy

Full names

packer_main_text_heat <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 25, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site)

packer_main_text_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] grouped by\nyear of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 8.5) +

  theme(
    panel.grid = element_blank(),
    legend.position = "bottom",
    plot.margin =  margin(1,1,1,1)
        )

Shortened names

packer_main_text_heat_names_tbl <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 23, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site, return_plot_data_tbl = TRUE)

packer_main_text_heat_names_tbl 
packer_main_text_heat_names_taxonomy_vector <- packer_main_text_heat_names_tbl %>%  group_by(taxonomy) %>% summarise(taxonomy = first(taxonomy)) %>% mutate(taxonomy_char = as.character(taxonomy)) %>% pull(taxonomy_char)

packer_main_text_heat_names_taxonomy_vector
##  [1] "Other taxa"                                                                                                                        
##  [2] "k__Bacteria; p__Firmicutes; c__Dethiobacteria; o__Dethiobacterales; f__Dethiobacteraceae; g__Dethiobacter; s__NA"                  
##  [3] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Hydrogenophilaceae; g__Thiobacillus; s__thioparus"  
##  [4] "k__Bacteria; p__Bacteroidota; c__Kryptonia; o__Kryptoniales; f__BSV26; g__NA; s__NA"                                               
##  [5] "k__Bacteria; p__Patescibacteria; c__ABY1; o__Candidatus_Magasanikbacteria; f__NA; g__NA; s__NA"                                    
##  [6] "k__Bacteria; p__Acetothermia; c__Acetothermiia; o__NA; f__NA; g__NA; s__NA"                                                        
##  [7] "k__Bacteria; p__Nitrospirota; c__Nitrospiria; o__Nitrospirales; f__Nitrospiraceae; g__Nitrospira; s__NA"                           
##  [8] "k__Bacteria; p__Patescibacteria; c__Parcubacteria; o__NA; f__NA; g__NA; s__NA"                                                     
##  [9] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Azohydromonas; s__NA"            
## [10] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__NA; s__NA"                     
## [11] "k__Bacteria; p__Patescibacteria; c__CPR2; o__NA; f__NA; g__NA; s__NA"                                                              
## [12] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Silanimonas; s__NA"            
## [13] "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus_Nitrososphaera; s__NA"
## [14] "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Opitutales; f__Opitutaceae; g__Lacunisphaera; s__NA"                    
## [15] "k__Bacteria; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA"                                                                             
## [16] "k__Bacteria; p__Bacteroidota; c__Ignavibacteria; o__Ignavibacteriales; f__SM1H02; g__NA; s__NA"                                    
## [17] "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Candidatus_Nitrosotenuis; s__NA"   
## [18] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Sutterellaceae; g__NA; s__NA"                       
## [19] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__NA; f__NA; g__NA; s__NA"                                                
## [20] "k__Bacteria; p__GAL15; c__NA; o__NA; f__NA; g__NA; s__NA"                                                                          
## [21] "k__Bacteria; p__Deinococcota; c__Deinococci; o__Thermales; f__Thermaceae; g__Meiothermus; s__NA"                                   
## [22] "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Parvibaculales; f__Parvibaculaceae; g__Parvibaculum; s__lavamentivorans"
## [23] "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Brachymonas; s__NA"              
## [24] "k__Bacteria; p__Nitrospirota; c__Thermodesulfovibrionia; o__NA; f__NA; g__NA; s__NA"
ampli_packer_main_text_taxa_filtered %>% ampli_filter_strings(col_to_filter = taxonomy, strings_to_filter = packer_main_text_heat_names_taxonomy_vector, detection_method = "complete", action = "keep")  %>% 
  # remove taxa with zero reads
  ampli_rm_0_read_taxa() %>% group_by(taxonomy) %>% summarise(Kingdom = first(Kingdom), Phylum = first(Phylum), Class = first(Class), Order = first(Order), Family = first(Family), Genus = first(Genus), Species = first(Species))
## 1 zero-read taxa removed from data set.
packer_main_text_heat_names_tbl_edit <- packer_main_text_heat_names_tbl  %>% mutate(tax_short = case_when(
  taxonomy == "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrosopumilales; f__Nitrosopumilaceae; g__Candidatus_Nitrosotenuis; s__NA" ~ "g. \\textit{Ca.} Nitrosotenuis",
  taxonomy == "k__Archaea; p__Crenarchaeota; c__Nitrososphaeria; o__Nitrososphaerales; f__Nitrososphaeraceae; g__Candidatus_Nitrososphaera; s__NA" ~ "g. \\textit{Ca.} Nitrososphaera",
  taxonomy == "k__Bacteria; p__Acetothermia; c__Acetothermiia; o__NA; f__NA; g__NA; s__NA" ~ "c. Acetothermiia",
  taxonomy == "k__Bacteria; p__Bacteroidota; c__Ignavibacteria; o__Ignavibacteriales; f__SM1H02; g__NA; s__NA" ~ "f. SM1H02",
  taxonomy == "k__Bacteria; p__Bacteroidota; c__Kryptonia; o__Kryptoniales; f__BSV26; g__NA; s__NA" ~ "f. BSV26",
  taxonomy == "k__Bacteria; p__Chloroflexi; c__P2-11E; o__NA; f__NA; g__NA; s__NA" ~ "c. P2-11E",
  taxonomy == "k__Bacteria; p__Deinococcota; c__Deinococci; o__Thermales; f__Thermaceae; g__Meiothermus; s__NA" ~ "g. \\textit{Meiothermus}",
  taxonomy == "k__Bacteria; p__Firmicutes; c__Dethiobacteria; o__Dethiobacterales; f__Dethiobacteraceae; g__Dethiobacter; s__NA" ~ "g. \\textit{Dethiobacter}",
  taxonomy == "k__Bacteria; p__GAL15; c__NA; o__NA; f__NA; g__NA; s__NA" ~ "p. GAL15",
  taxonomy == "k__Bacteria; p__NA; c__NA; o__NA; f__NA; g__NA; s__NA" ~ "k. Bacteria",
  taxonomy == "k__Bacteria; p__Nitrospirota; c__Nitrospiria; o__Nitrospirales; f__Nitrospiraceae; g__Nitrospira; s__NA" ~ "g. \\textit{Nitrospira}",
  taxonomy == "k__Bacteria; p__Nitrospirota; c__Thermodesulfovibrionia; o__NA; f__NA; g__NA; s__NA" ~ "c. Thermodesulfovibrionia",
  taxonomy == "k__Bacteria; p__Patescibacteria; c__ABY1; o__Candidatus_Magasanikbacteria; f__NA; g__NA; s__NA" ~ "o. \\textit{Ca.} Magasanikbacteria",  
  taxonomy == "k__Bacteria; p__Patescibacteria; c__CPR2; o__NA; f__NA; g__NA; s__NA" ~ "c. CPR2",
  taxonomy == "k__Bacteria; p__Patescibacteria; c__Parcubacteria; o__NA; f__NA; g__NA; s__NA" ~ "c. Parcubacteria",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Parvibaculales; f__Parvibaculaceae; g__Parvibaculum; s__lavamentivorans" ~ "s. \\textit{lavamentivorans}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Azohydromonas; s__NA" ~ "g. \\textit{Azohydromonas}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Brachymonas; s__NA" ~ "g. \\textit{Brachymonas}",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Hydrogenophilaceae; g__Thiobacillus; s__thioparus" ~ "s. \\textit{thioparus}",  
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Sutterellaceae; g__NA; s__NA" ~ "f. Sutterellaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__NA; f__NA; g__NA; s__NA" ~ "c. Gammaproteobacteria",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__NA; s__NA" ~ "f. Xanthomonadaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Silanimonas; s__NA" ~ "g. \\textit{Silanimonas}",
  taxonomy == "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Opitutales; f__Opitutaceae; g__Lacunisphaera; s__NA" ~ "g. \\textit{Lacunisphaera}",
  taxonomy == "Other taxa" ~ "Other taxa"))

packer_main_text_heat_names_tbl_edit
packer_main_text_heat <- ampli_packer_main_text_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = packer_interval, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 23, y_taxa_arrangement = "abund", facet_grid_sample_group_col = year_sampling_site, custom_taxa_names_tbl = packer_main_text_heat_names_tbl_edit, custom_taxa_names_col = tax_short)

packer_main_text_heat +
  # plot geometry
  geom_text(parse = FALSE, size = 3) +
  
  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] grouped by\nyear of sampling and well name", expand = c(0,0)) +
    scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0), labels = TeX) +

  theme_bw(base_size = 11) +

  theme(
    panel.grid = element_blank(),
    # legend.position = "bottom",
    plot.margin =  margin(1,1,1,1)
        )

2018 BA1A filter size fractions

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled == 2018) %>% 
  
  # add column for packer interval
mutate(packer_interval = if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct)))) %>% 
         # add column for filter pore diameter
mutate(filter_diam = 
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "0.45",
  TRUE ~ "0.22"
)) %>% 
  
  ampli_heat_map(x_sample_group_col = filter_diam, plot_other_taxa_bin = TRUE, top_n = 20,  y_taxa_arrangement = "abund", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", facet_grid_sample_group_col = packer_interval) +

  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Filter pore diameter / [µm] grouped by\ndepth of sampling interval / [m] ", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 9) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

CH\(_4\) taxa

CH4_taxa <- c("Methanobacteria", "Methanomicrobia", "Methanopyrales", "Methanocellales", "Methanoplasmatales", "Methanosarcinales", "Methanomassiliicocc", "Methylococc", "Methylocystis", "Methylosinus", "Methylocella", "Methylocapsa", "Methylacidiphil", "Methylomirabilis", "ANME")

CH4_plotted_taxa_names_tbl <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled > 2017) %>% 
# add column combining year and well
mutate(year_sampling_site = paste(year_sampled, sampling_site)) %>% 
# add column combining site, packer_interval, and filter pore diamter
mutate(packer_interval_filter_diam = paste(if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))),# "\n0.22 µm")
                                          
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "\n0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "\n0.45",
  TRUE ~ "\n0.22"
))
) %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, facet_grid_sample_group_col = year_sampling_site, return_plot_data_tbl = TRUE)
## Warning: `as_character()` is deprecated as of rlang 0.4.0
## Please use `vctrs::vec_cast()` instead.
## This warning is displayed once per session.
CH4_plotted_taxa_names_tbl
CH4_plotted_taxa_names_tbl_edit <- CH4_plotted_taxa_names_tbl  %>% mutate(tax_short = case_when(
  taxonomy == "k__Archaea; p__Euryarchaeota; c__Methanobacteria; o__Methanobacteriales; f__Methanobacteriaceae; g__Methanobacterium; s__NA" ~ "g. $\\textit{Methanobacterium}$",
  taxonomy == "k__Bacteria; p__Verrucomicrobiota; c__Verrucomicrobiae; o__Methylacidiphilales; f__Methylacidiphilaceae; g__NA; s__NA" ~ "f. Methylacidiphilaceae",
  taxonomy == "k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Methylococcales; f__Methylococcaceae; g__Methylocaldum; s__NA" ~ "g. \\textit{Methylocaldum}"
))

CH4_plotted_taxa_names_tbl_edit
CH4_plotted_taxa_names_tbl_edit %>% group_by(year_sampling_site, packer_interval_filter_diam) %>% summarise()
## `summarise()` has grouped output by 'year_sampling_site'. You can override using the `.groups` argument.
ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam <- ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # remove samples taken prior to 2018
  filter(year_sampled > 2017) %>% 
# add column combining year and well
mutate(year_sampling_site = paste(year_sampled, sampling_site)) %>% 
# add column combining site, packer_interval, and filter pore diameter
mutate(packer_interval_filter_diam = paste(if_else(str_detect(sampling_method, "packer") == FALSE, NA_character_, paste0(if_else(upper_packer_inflated == FALSE, 0, upper_packer_depth_mbct), "-", if_else(lower_packer_inflated == FALSE, well_depth_mbgl, lower_packer_depth_mbct))),# "\n0.22 µm")
                                          
       case_when(
  str_detect(sample_id, "_10_") == TRUE ~ "\n0.10",
  str_detect(sample_id, "_45_")  == TRUE ~ "\n0.45",
  TRUE ~ "\n0.22"
))
)

# sort levels for plotting
ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam$packer_interval_filter_diam <- factor(ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam$packer_interval_filter_diam, levels = c(
  "55-66 \n0.10",
  "55-66 \n0.22",
  "55-66 \n0.45",
  "100-400 \n0.10",
  "100-400 \n0.22",
  "100-400 \n0.45",
  "0-30 \n0.22",
  "41-65 \n0.22",
  "108-132 \n0.22",
  "45-75 \n0.22",
  "102-132 \n0.22"
))
ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam  %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = CH4_taxa, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 20, facet_grid_sample_group_col = year_sampling_site, custom_taxa_names_tbl = CH4_plotted_taxa_names_tbl_edit, custom_taxa_names_col = tax_short) +

  # plot geometry
  geom_text(parse = FALSE) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "white", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] and filter pore diameter / [µm]\ngrouped by year of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest\ntaxonomic assignment", expand = c(0,0), labels = TeX) +

  theme_bw(base_size = 10.5) +

  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

\(\text{C}_{2}\)-\(\text{C}_{6}\) Short-Chain Alkane (SCA)-consuming taxa

# input SCA-consuming taxa from Singh et al. (2017), Laso-Pérez et al. (2019), and Shennan et al. (2006)

all_SCA_taxa_oman <- c("BuS5", "Syntrophoarchaeum", "HotSeep-1", "Argoarchaeum", "ethanivorans", "Methanoliparia", "GoM", "Actinomyces", "Arthrobacter", "Brevibacterium", "Corynebacterium", "Gordonia", "Mycobacterium", "Nocardia", "Nocardiodes", "Rhodococcus", "Acinetobacter", "cepacia", "Pseudomonas", "Ralstonia")
ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  ampli_heat_map(x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = all_SCA_taxa_oman, plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 1, text_label_format = "scientific") +
  # plot geometry
  geom_text(parse = TRUE, size = 2.1) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 6.5) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank()
        )

There were no species-level assignments to known short-chain alkane oxidizing taxa. The genera shown above contain organisms capable of many metabolic processes unrelated to short-chain alkane oxidation, so their presence is not strong evidence for microbial short-chain alkane oxidation.

Hydrogenophaga

hydrogenophaga_taxa_oman <- c("Hydrogenophaga")
ampli_OM18_thru_19_focus_samples_taxa_filtered_packer_interval_filter_diam %>% 
  ampli_heat_map(x_sample_group_col = packer_interval_filter_diam, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = hydrogenophaga_taxa_oman, plot_other_taxa_bin = FALSE, text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", top_n = 50, facet_grid_sample_group_col = year_sampling_site, y_taxa_col = Genus) +

  # plot geometry
  geom_text(parse = FALSE) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "white", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Depth of sampling interval / [m] and filter pore diameter / [µm]\ngrouped by year of sampling and well name", expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 11) +

  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    plot.margin =  margin(1,1,1,1),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
        )
## Warning: Unknown levels in `f`: Other taxa

2017 drill foam samples

All taxa

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # filter for drill foam samples
  filter(sample_type == "drill foam emerged from well") %>% 
  ampli_heat_map(x_sample_group_col = sample_id, plot_other_taxa_bin = TRUE, top_n = 20,  y_taxa_arrangement = "abund", text_label_scalar = 100, text_label_decimal_places = 0, text_label_threshold = 0.01, text_label_zero = "n.r.", text_label_threshold_round_priority = "round", facet_grid_sample_group_col = nucleic_acid_type) +

  # plot geometry
  geom_text(parse = FALSE, size = 2.5) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = "Replicate grouped by nucleic acid", expand = c(0,0)) +
  scale_y_discrete(name = "Deepest taxonomic assignment", expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.margin =  margin(1,1,1,1)
        )

Brachymonas and Parvibaculum

Brachymonas and Parvibaculum were not detected. I also included Pseudomonas in the search here just so that the plot would still have an output.

ampli_OM17_thru_19_focus_samples_taxa_filtered %>%
  # filter for drill foam samples
  filter(sample_type == "drill foam emerged from well") %>%
  ampli_heat_map(x_sample_group_col = sampling_site, taxa_selection_method = "custom_taxa_char_vector", custom_taxa_char_vector = c("Brachymonas", "Parvibaculum", "Pseudomonas"), plot_other_taxa_bin = FALSE, y_taxa_arrangement = "alpha", text_label_scalar = 100, text_label_decimal_places = 1, text_label_format = "scientific") +
  # plot geometry
  geom_text(parse = TRUE, size = 2.1) +

  # plot styling
  scale_fill_gradient(name = "Read relative\nabundance / [%]", low = "white", high = "red", labels = label_percent(accuracy = 1, suffix = "")) +
  scale_x_discrete(name = NULL, expand = c(0,0)) +
  scale_y_discrete(name = NULL, expand = c(0,0)) +

  theme_bw(base_size = 8) +

  theme(
    legend.position = "bottom",
    panel.grid = element_blank()
        )