## Loading required package: knitr

1 Maps and other figures generated outside of R

1.1 Stations sampled in 2014

knitr::include_graphics("Duhaime_Fig1.png")

1.2 Sample processing

knitr::include_graphics("Duhaime_Fig2_sample_images.png")

1.3 Sample concentrations

knitr::include_graphics("Duhaime_Fig3_count_maps_total.png")

1.4 SEM-EDS example images

knitr::include_graphics("Duhaime_Fig7_EDS_spectra.png")

1.5 Transport model outputs

knitr::include_graphics("Duhaime_Fig9_model_months.png")

knitr::include_graphics("Duhaime_Fig10_residence_time_model.png")

knitr::include_graphics("Duhaime_SI_Fig4.png")

2 Plastic concentrations

2.1 Importing and Managing Count Data and Trawl Metadata

## Read in csv
## total_conc_km2 column is summation of counts from all non-fiber plastic types: does not include fibers

gl.2014.plastics.conc.df <- 
  read.csv(file = "~/Box Sync/Duhaime Lab/R projects/gl_2014_count_data_github/final_directory/duhaime_gl_2014_plastic_concentrations.csv")

# Set factor levels for size fractions

gl.2014.plastics.conc.df$size_fraction_um <-
  factor(gl.2014.plastics.conc.df$size_fraction_um, levels = c('106-1000', '1000-4750', '>4750'))

We currently have data for 38 stations and 108 trawls.

## Display data table of concentrations

datatable(gl.2014.plastics.conc.df, 
          options = list(pageLength = 10),
          caption = "Total concentrations, long format"
          )

Note that the NK 0009-2 106-1000um fragment concentration is an order of magnitude larger than all others; this is an unreliable count on an anomalously difficult sample, so we remove this concentration from the data set.

## Remove unreliable data

remove_row_num <- which(gl.2014.plastics.conc.df$'trawl' == "NK 0009-2" & gl.2014.plastics.conc.df$'size_fraction_um' == "106-1000")

gl.2014.plastics.conc.df[remove_row_num, 4:13] <- NA

table.gl.2014.conc.by.size.comp.melt <- 
  melt(gl.2014.plastics.conc.df, 
       measure.vars = c('fragment_conc_km2', 
                        'film_conc_km2', 
                        'line_conc_km2', 
                        'nurdle_conc_km2', 
                        'sphere_conc_km2', 
                        'foam_conc_km2', 
                        'paint_conc_km2',
                        'total_conc_km2', 
                        'fiber_conc_km2'), 
       id.vars = c('station', 'trawl', 'size_fraction_um'), 
       value.name = "concentration", 
       variable.name = "conc_type", 
       na.rm = FALSE)

2.1.1 Trawl metadata

gl.2014.weather.data <- read.csv("~/Box Sync/Duhaime Lab/R projects/gl_2014_count_data_github/final_directory/duhaime_gl_2014_trawl_metadata.csv")

## Rearrange factor levels for station type and water body

gl.2014.weather.data$water_body <- factor(gl.2014.weather.data$water_body, levels = c("Lake Superior", "Lake Huron", "Lake St. Clair", "Detroit Rv.", "Lake Erie", "Niagara Rv."))

gl.2014.weather.data$station_class <- factor(gl.2014.weather.data$station_class, levels = c("Basin", "Non urban", "Urban", "River plume", "WWTP"))

datatable(gl.2014.weather.data, options = list(
  pageLength = 10,
  autoWidth = TRUE),
  caption = "Station metadata"
)

## Merge trawl datasets for correlations and plotting

gl.2014.all.data <- 
  merge(gl.2014.plastics.conc.df, gl.2014.weather.data, 
        by = c("trawl","station"), 
        all.x = TRUE, 
        suffixes = c(".x", ".y"))

## Create dataframe of only nonnumeric trawl metadata

gl.2014.trawl.data.nonnumeric<- 
  gl.2014.weather.data[
    sapply(
      gl.2014.weather.data, 
      function(x) !is.numeric(x))]

# Merge with per-trawl concentration data

gl.2014.all.data.nonnumeric <- merge(gl.2014.plastics.conc.df, 
                                     gl.2014.trawl.data.nonnumeric, 
                                     by = c("trawl","station"), 
                                     all.x = TRUE, 
                                     suffixes = c(".x", ".y"))

datatable(gl.2014.all.data.nonnumeric, 
          options = list(pageLength = 10),
          caption = "Trawl concentrations and non-numeric metadata"
          )

# Transform trawl data into long format by concentrations for plotting and stats

gl.all.trawl.data.nonnumeric.long <-
  melt(gl.2014.all.data.nonnumeric, 
       measure.vars = c('fragment_conc_km2', 
                        'film_conc_km2', 
                        'line_conc_km2', 
                        'nurdle_conc_km2', 
                        'sphere_conc_km2', 
                        'foam_conc_km2', 
                        'paint_conc_km2', 
                        'total_conc_km2', 
                        'fiber_conc_km2'), 
       id.vars = c('trawl', 
                   'station', 
                   'size_fraction_um', 
                   'trawl_date', 
                   'trawl_start_time', 
                   'water_body',
                   'location', 
                   'station_class'), 
       value.name = "concentration",
       variable.name = "conc_type", 
       na.rm = FALSE)

gl.all.trawl.data.nonnumeric.long$conc_type <- 
  factor(gl.all.trawl.data.nonnumeric.long$conc_type, 
         levels = c("fragment_conc_km2",
                    "line_conc_km2",
                    "film_conc_km2",
                    "foam_conc_km2",
                    "nurdle_conc_km2",
                    "sphere_conc_km2",
                    "paint_conc_km2",
                    "total_conc_km2",
                    "fiber_conc_km2"))

#fix instances of gl.all.trawl.data.nonnumeric.long.tot
gl.all.trawl.data.nonnumeric.long.tot <- 
  subset(
    gl.all.trawl.data.nonnumeric.long, 
    conc_type == 'total_conc_km2')

gl.all.trawl.data.nonnumeric.long.tot.noNA <-
  subset(gl.all.trawl.data.nonnumeric.long.tot,
         !is.na(concentration))

#View(gl.all.trawl.data.nonnumeric.long.tot)

gl.all.trawl.data.nonnumeric.long.comp <- 
  subset(
    gl.all.trawl.data.nonnumeric.long, 
    conc_type != 'total_conc_km2' & conc_type != 'fiber_conc_km2')
#View(gl.all.trawl.data.nonnumeric.long.comp)

gl.all.trawl.data.nonnumeric.long.comp.wfiber <- 
  subset(
    gl.all.trawl.data.nonnumeric.long, 
    conc_type != 'total_conc_km2')

gl.all.trawl.data.nonnumeric.long.comp.wfiber.noNA <- 
  subset(gl.all.trawl.data.nonnumeric.long.comp.wfiber,
         !is.na(concentration))

dim(gl.all.trawl.data.nonnumeric.long.comp)

## [1] 2268   10

gl.all.trawl.data.nonnumeric.long.comp.noNA <-
  subset(gl.all.trawl.data.nonnumeric.long.comp, 
         !is.na(concentration))
dim(gl.all.trawl.data.nonnumeric.long.comp.noNA)

## [1] 1540   10

#View(gl.all.trawl.data.nonnumeric.long.comp.noNA)

2.1.1.1 Trawl totals across all size fractions

gl.2014.plastics.tot.conc <- 
  as.data.frame(gl.2014.plastics.conc.df[,c('trawl', 
                              'station', 
                              'size_fraction_um', 
                              'total_conc_km2')])

gl.2014.plastics.tot.conc.nona <- 
  na.omit(gl.2014.plastics.tot.conc)

gl.2014.plastics.tot.wide <-
  dcast(data = gl.2014.plastics.tot.conc.nona,
        formula = trawl + station ~ size_fraction_um,
        value.var = 'total_conc_km2',
        fill = NA_real_)

gl.2014.plastics.tot.wide.complete.only <-
  as.data.frame(na.omit(gl.2014.plastics.tot.wide))

gl.2014.plastics.tot.wide.complete.only$total_concentration <- 
  gl.2014.plastics.tot.wide.complete.only$'106-1000' + 
  gl.2014.plastics.tot.wide.complete.only$'1000-4750' +
  gl.2014.plastics.tot.wide.complete.only$'>4750'

gl.2014.plastics.tot.wide.all.data <-
  merge(gl.2014.plastics.tot.wide.complete.only, 
        gl.2014.weather.data, 
        by = c("trawl","station"), 
        all.x = TRUE, 
        suffixes = c(".x", ".y"))

gl.2014.plastics.tot.wide.all.data <-
gl.2014.plastics.tot.wide.all.data[,c("trawl", 
  "station",
  "water_body",
  "location",
  "station_class",
  "106-1000",
  "1000-4750",
  ">4750",
  "total_concentration",
  "trawl_date",
  "trawl_start_time",
  "flowm_dist_m",
  "flowm_area_km2",
  "wet_mass",
  "dry_mass",
  "air_vel_avg_kt",
  "cloud_cover_avg",
  "air_temp_avg_C",
  "water_temp_avg_C",
  "E_wtr_vel_surf_avg_kt",
  "N_wtr_vel_surf_avg_kt",
  "wv_drctn_avg",
  "wv_prd_avg",
  "sig_wv_ht_avg_m")]

datatable(gl.2014.plastics.tot.wide.all.data,
          options = list(pageLength = 25),
          caption = "Total of all fractions for complete trawls (km-2)")

write.csv(gl.2014.plastics.tot.wide.all.data, 
          file = "./outfiles_final/final_data_sheet_1_total_concentration_all_sizes_all_data.csv", 
          na = "NA",
          row.names = FALSE)

2.1.2 Station metadata

# Create a wide dataframe of trawl mean concentrations by size class, maintaining nonnumeric metadata and concentration components

gl.all.trawl.data.nonnumeric.long.nona <- 
  na.omit(gl.all.trawl.data.nonnumeric.long)

gl.all.data.nonnnumeric.mean.stn.wide <- 
  dcast(
    data=gl.all.trawl.data.nonnumeric.long.nona,
    station_class+water_body+location+trawl_date +station+size_fraction_um ~ conc_type,
    mean,
    value.var = 'concentration'
    )

# Create a long dataframe of trawl mean concentrations by size class for plotting and stats, maintaining nonnumeric metadata and concentration components

gl.all.data.nonnumeric.mean.stn.long <- 
  melt(
    gl.all.data.nonnnumeric.mean.stn.wide,
    measure.vars = c('fragment_conc_km2',
                     'film_conc_km2', 
                     'line_conc_km2', 
                     'nurdle_conc_km2', 
                     'sphere_conc_km2', 
                     'foam_conc_km2', 
                     'paint_conc_km2', 
                     'total_conc_km2', 
                     'fiber_conc_km2'), 
    id.vars = c('station', 
                'size_fraction_um', 
                'trawl_date', 
                'water_body', 
                'location', 
                'station_class'), 
   value.name = "mean_concentration", 
   variable.name = "conc_type"
   )

datatable(gl.all.data.nonnumeric.mean.stn.long, 
          options = list(pageLength = 10),
          caption = "Mean concentrations and nonnumeric metadata, long format"
          )

2.2 Exploring Data

2.2.1 Count Data Shape

A simple histogram of all total concentrations by size fraction was created to establish an appreciation for shape of the data.

table.gl.2014.conc.by.size.comp.melt.nona <- 
  na.omit(table.gl.2014.conc.by.size.comp.melt)

table.gl.2014.conc.by.size.comp.melt.nona.tot <- 
   subset(
      table.gl.2014.conc.by.size.comp.melt.nona, 
      conc_type == 'total_conc_km2')

ggplot(
  table.gl.2014.conc.by.size.comp.melt.nona.tot, 
  aes(x = concentration)) +
  geom_histogram(bins = 500) +
  facet_grid(size_fraction_um ~ ., 
             scales = "free")

Repeat the same for the component concentrations.

table.gl.2014.conc.by.size.comp.melt.nona.comp <- 
   subset(
      table.gl.2014.conc.by.size.comp.melt.nona, 
      conc_type != 'total_conc_km2')

The 106-1000 µm fraction is pretty evenly distributed (likely because data are more sparse–due to labor intensiveness, not all trawls were counted) and the concentrations for the larger two fractions are heavily postitively skewed (toward the zeroes and smaller numbers). We can also see this in the skewness (very positive) and kurtosis (high positive numbers) for the total concentrations.

trawl.tot.conc.skew.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('size_fraction_um'),
    function(x) round(skewness(x$'concentration'), 2))

trawl.tot.conc.kurtosis.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('size_fraction_um'), 
    function(x) round(kurtosis(x$'concentration'), 2))

trawl.tot.conc.skew.kurt.by.size <- 
  cbind(
    trawl.tot.conc.skew.by.size,
    Kurt. = trawl.tot.conc.kurtosis.by.size$V1
    )
colnames(trawl.tot.conc.skew.kurt.by.size) <-
  c('size_fraction_um', 'Skew.', 'Kurt.')

datatable(
  trawl.tot.conc.skew.kurt.by.size,
  options = list(pageLength = 3),
  caption = "Skewdness and kurtosis of trawl total plastic concentrations (km^-2)")

2.2.2 Testing Normality

Based on visual inspection, these data do not appear to have a normal distribution. We next test this hypothesis of non-normality with the Shapiro-Wilks test: are the concentrations non-normally distributed? (smaller p-value, higher probability of being non-normal)

shap.test.trawl.conc.type.W <- aggregate(concentration ~ conc_type, data = gl.all.trawl.data.nonnumeric.long, FUN = function(x) shapiro.test(x)$statistic)

shap.test.trawl.conc.type.pval <- aggregate(concentration ~ conc_type, data = gl.all.trawl.data.nonnumeric.long, FUN = function(x) shapiro.test(x)$p.value)

shap.test.trawl.conc.type.pval

##           conc_type concentration
## 1 fragment_conc_km2  1.308586e-28
## 2     line_conc_km2  8.529583e-25
## 3     film_conc_km2  5.179129e-25
## 4     foam_conc_km2  3.103382e-28
## 5   nurdle_conc_km2  1.200617e-30
## 6   sphere_conc_km2  2.406124e-29
## 7    paint_conc_km2  3.243382e-27
## 8    total_conc_km2  4.203199e-28
## 9    fiber_conc_km2  2.584405e-29

shap.test.trawl.size.frac.W <- aggregate(concentration ~ size_fraction_um, data = gl.all.trawl.data.nonnumeric.long, FUN = function(x) shapiro.test(x)$statistic)

shap.test.trawl.size.frac.pval <- aggregate(concentration ~ size_fraction_um, data = gl.all.trawl.data.nonnumeric.long, FUN = function(x) shapiro.test(x)$p.value)

shap.test.trawl.size.frac.pval

##   size_fraction_um concentration
## 1         106-1000  1.068027e-10
## 2        1000-4750  1.020828e-52
## 3            >4750  6.743893e-55

shap.test.stn.conc.type.W <- 
  aggregate(mean_concentration ~ conc_type,
            data = gl.all.data.nonnumeric.mean.stn.long,
            FUN = function(x) shapiro.test(x)$statistic)

shap.test.stn.conc.type.pval <- aggregate(mean_concentration ~ conc_type, 
                                          data = 
                                        gl.all.data.nonnumeric.mean.stn.long,
                                          FUN = function(x) 
                                            shapiro.test(x)$p.value)

shap.test.stn.conc.type.pval

##           conc_type mean_concentration
## 1 fragment_conc_km2       4.048243e-17
## 2     film_conc_km2       1.111000e-13
## 3     line_conc_km2       1.399332e-12
## 4   nurdle_conc_km2       4.725135e-18
## 5   sphere_conc_km2       1.127956e-16
## 6     foam_conc_km2       5.856112e-16
## 7    paint_conc_km2       1.866749e-14
## 8    total_conc_km2       7.522970e-17
## 9    fiber_conc_km2       1.578691e-16

shap.test.stn.size.frac.W <- aggregate(mean_concentration ~ size_fraction_um, 
                                       data = 
                                        gl.all.data.nonnumeric.mean.stn.long,
                                       FUN = function(x) 
                                         shapiro.test(x)$statistic)

shap.test.stn.size.frac.pval <- aggregate(mean_concentration ~ size_fraction_um, 
                                          data = 
                                        gl.all.data.nonnumeric.mean.stn.long, 
                                          FUN = function(x) 
                                            shapiro.test(x)$p.value)

shap.test.stn.size.frac.pval

##   size_fraction_um mean_concentration
## 1         106-1000       6.083180e-09
## 2        1000-4750       1.047869e-33
## 3            >4750       2.569836e-36

According to the Shapiro Wilks test on all component trawl concentrations and total concentrations, the distributions vary significantly from normal (p<<<0.5). The same result is observed with station concentrations, although the p-values are higher for station means.

Conclusion: None of these datasets are normally distributed, so parametric tests are not an option without data transformations.

2.2.3 Centrality by Mean or Median?

Based on the data shape, we next ask: what best describes the centrality of these data: mean or median of concentrations?

Mean (red dashed line) and median (blue dotted line) for the total concentration histograms were plotted.

table.gl.2014.conc.by.size.comp.melt.nona.tot.means <-
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    "size_fraction_um", 
    summarise, 
    size.fraction.mean = mean(concentration))

table.gl.2014.conc.by.size.comp.melt.nona.tot.medians <-
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    "size_fraction_um", 
    summarise, 
    size.fraction.median = median(concentration))

ggplot(
  table.gl.2014.conc.by.size.comp.melt.nona.tot, 
  aes(x = concentration)) +
  geom_histogram(bins = 500) + 
  facet_grid(size_fraction_um ~ ., 
             scales = "free") + 
  scale_x_continuous(limits = c(0, 1500000))+
  geom_vline(data = table.gl.2014.conc.by.size.comp.melt.nona.tot.means,
             aes(xintercept= size.fraction.mean),
             linetype = "dashed",
             size = 0.5, 
             color = "red") + 
    geom_vline(data = table.gl.2014.conc.by.size.comp.melt.nona.tot.medians,
             aes(xintercept= size.fraction.median),
             linetype = "dotted",
             size = 0.5, 
             color = "blue") +
    annotate("text", label = "blue = median   red = mean", size = 3, x = 1500000, y = 3)

## Warning: Removed 1 rows containing non-finite values (stat_bin).

Mean appears to represent the data better than the median, because although it doesn’t fall where the majority of the data are, it falls more central to the full range of data.

Representing the spread of the data, however, is going to be more difficult because the data are:
* so postitively skewed that the standard deviation will be very large
* limited by zero on the lower end but not limited on the upper end

A quick summary of the total concentrations also exemplifies this issue, and how it varies across size fractions. The mean concentration for each fraction is suprisingly close to (and for the smallest fraction, greater than) the upper quartile (75th percentile) value. This means ~75% of the concentrations in each size fraction are below the mean value.

trawl.tot.conc.summary.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('size_fraction_um'), 
    function(x) summary(x$'concentration'))

trawl.tot.conc.sd.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('size_fraction_um'), 
    function(x) sd(x$'concentration'))

trawl.tot.conc.iqr.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('size_fraction_um'), 
    function(x) IQR(x$'concentration'))

trawl.tot.conc.summary.by.size$StDev. <- 
  round(trawl.tot.conc.sd.by.size$V1, 0)

trawl.tot.conc.summary.by.size$IQR <- 
  round(trawl.tot.conc.iqr.by.size$V1, 0)

datatable(
  trawl.tot.conc.summary.by.size,
  options = list(pageLength = 3),
  caption = "Summary of trawl total plastic concentrations (km^-2)")

The Q-Q plot below shows that the concentrations in the 1000-4750 um size class deviate from linearity and are thus non-normally distributed–with skew particularly evident at higher concentrations.

params.km <- as.list(fitdistr(subset(
      table.gl.2014.conc.by.size.comp.melt.nona.tot$concentration, 
      table.gl.2014.conc.by.size.comp.melt.nona.tot$size_fraction_um == '1000-4750'), "exponential")$estimate)

ggplot(
  data = subset(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    size_fraction_um == '1000-4750'), 
  geom = 'blank') +
  stat_qq(aes(sample = concentration), 
          distribution = qexp, 
          dparams = params.km) + 
  geom_abline(slope = 1, 
              intercept = 0)

The above spread can also be represented as a boxplot (red point is mean value). Note that the means are higher than the median (central line of boxplot), and for the mid and large size class are greater than the 75% percentile (top of box).

ggplot(table.gl.2014.conc.by.size.comp.melt.nona.tot, aes(x = size_fraction_um, y = concentration)) +
  geom_boxplot(aes(y = (concentration+.01), x = factor(size_fraction_um))) + 
  stat_summary(fun.y="mean", geom="point",colour = "red", size = 2)+
 # scale_y_log10() + 
  theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1))

2.3 Data by trawl (each trawl treated individually)

2.3.1 Summary tables by size fraction

table.gl.2014.conc.by.size.comp.melt.nona <- 
  na.omit(table.gl.2014.conc.by.size.comp.melt)

trawl.prop.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.conc.by.size.comp.melt.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.conc.by.size.comp.sum <-
  aggregate(data=table.gl.2014.conc.by.size.comp.melt.nona,
            concentration~size_fraction_um+conc_type,
            sum)
 
table.gl.2014.conc.by.size.comp.sum.cast <-
  dcast(data = table.gl.2014.conc.by.size.comp.sum,
        formula = size_fraction_um ~ conc_type,
        fill = NA,
        value.var = "concentration")

table.gl.2014.conc.totals.by.size.comp.prop <-
  cbind(table.gl.2014.conc.by.size.comp.sum.cast$'size_fraction_um', 
        table.gl.2014.conc.by.size.comp.sum.cast[,2:8]/table.gl.2014.conc.by.size.comp.sum.cast$'total_conc_km2')

conc.type.col.names <- c("Size (µm)", 
                         "Fragment", 
                         "Film", 
                         "Line", 
                         "Nurdle", 
                         "Sphere", 
                         "Foam", 
                         "Paint")

colnames(table.gl.2014.conc.totals.by.size.comp.prop) <- conc.type.col.names

table.gl.2014.conc.totals.by.size.comp.prop$n <- 
  trawl.prop.conc.size.count$total_conc_km2

# kable(
#   table.gl.2014.conc.totals.by.size.comp.prop, 
#   digits = 3, 
#   caption = "Proportion of summed individual trawl concentration")

xtable.prop.conc.types <- 
  xtable(table.gl.2014.conc.totals.by.size.comp.prop, 
         digits = 3, 
         caption = "Proportion of individual trawl concentration")

prop.conc.data <- 
  write.csv(
    table.gl.2014.conc.totals.by.size.comp.prop, 
    file = "./outfiles_final/final_proportion_total_conc_by_size_table.csv", 
    na = "NA")

table.gl.2014.conc.totals.by.size.comp.prop.round <- 
  cbind('Size (µm)' = table.gl.2014.conc.totals.by.size.comp.prop[,1], 
        round(table.gl.2014.conc.totals.by.size.comp.prop[,-1],3))

datatable(table.gl.2014.conc.totals.by.size.comp.prop.round,
          options = list(pageLength = 3),
          caption = "Proportion of individual trawl concentration"
          )

table.conc.tot.by.size.comp.prop.long <- 
  melt(table.gl.2014.conc.totals.by.size.comp.prop, 
       measure.vars = c("Fragment", 
                         "Film", 
                         "Line", 
                         "Nurdle", 
                         "Sphere", 
                         "Foam", 
                         "Paint"), 
       id.vars = c("Size (µm)"), 
       value.name = "Proportion", 
       variable.name = "Plastic type", 
       na.rm = TRUE)

classPalette <- c("deepskyblue3", "orange3", "firebrick","darkolivegreen3","deeppink3","gold1", "cadetblue3", "coral1")

# ggplot(table.conc.tot.by.size.comp.prop.long, 
#        aes(x = as.factor("Size (µm)"), 
#            y = Proportion, 
#            fill = "Plastic type")) +   
#   theme_bw()+
#   scale_fill_manual(values=classPalette)+
#   geom_bar() +
#   xlab("plastic size fraction (µm)")+
#   ylab("% of total plastic counted")+
#   theme(axis.text.x = element_text(angle = 45, hjust = 1))+
#   ggtitle("Sample composition by plastic shape class")

table.gl.2014.conc.by.size.comp.melt.nona <- 
  na.omit(table.gl.2014.conc.by.size.comp.melt)

trawl.mean.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.conc.by.size.comp.melt.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.conc.by.size.comp.mean <-
  aggregate(data=table.gl.2014.conc.by.size.comp.melt.nona, 
            concentration~size_fraction_um+conc_type,
            mean)
 
table.gl.2014.conc.by.size.comp.mean.cast <-
  dcast(table.gl.2014.conc.by.size.comp.mean,
        size_fraction_um~conc_type,
        fill = NA,
        value.var = "concentration")

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(table.gl.2014.conc.by.size.comp.mean.cast) <- 
  conc.type.col.names.w.tot

table.gl.2014.conc.by.size.comp.mean.cast$n <- 
  trawl.mean.conc.size.count$total_conc_km2

xtable.mean.conc.types <- 
  xtable(table.gl.2014.conc.by.size.comp.mean.cast, 
         digits = 3, 
         caption = "Mean of individual trawl concentration")

mean.conc.data <- 
  write.csv(
    table.gl.2014.conc.by.size.comp.mean.cast, 
    file = "./outfiles_final/final_trawl_mean_conc_by_size_table.csv", 
    na = "NA")

table.gl.2014.conc.by.size.comp.mean.cast.round <- 
  cbind('Size (µm)' = table.gl.2014.conc.by.size.comp.mean.cast[,1], 
        round(table.gl.2014.conc.by.size.comp.mean.cast[,-1],3))

table.gl.2014.conc.by.size.comp.melt.nona <- 
  na.omit(table.gl.2014.conc.by.size.comp.melt)

trawl.sd.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.conc.by.size.comp.melt.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.conc.by.size.comp.sd <-
  aggregate(data=table.gl.2014.conc.by.size.comp.melt.nona, 
            concentration~size_fraction_um+conc_type,
            sd)
 
table.gl.2014.conc.by.size.comp.sd.cast <-
  dcast(table.gl.2014.conc.by.size.comp.sd,
        size_fraction_um~conc_type,
        fill = NA,
        value.var = "concentration")

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(table.gl.2014.conc.by.size.comp.sd.cast) <- conc.type.col.names.w.tot

table.gl.2014.conc.by.size.comp.sd.cast$n <- 
  trawl.sd.conc.size.count$total_conc_km2

xtable.sd.conc.types <- 
  xtable(table.gl.2014.conc.by.size.comp.sd.cast, 
         digits = 3, 
         caption = "Standard deviation of individual trawl concentration")

sd.conc.data <- 
  write.csv(
    table.gl.2014.conc.by.size.comp.sd.cast, 
    file = "./outfiles_final/final_sd_conc_by_size_table.csv", 
    na = "NA")

table.gl.2014.conc.by.size.comp.sd.cast.round <- 
  cbind('Size (µm)' = table.gl.2014.conc.by.size.comp.sd.cast[,1], 
        round(table.gl.2014.conc.by.size.comp.sd.cast[,-1],3))

conc.mean.matrix <- 
  as.matrix(
    round(
      table.gl.2014.conc.by.size.comp.mean.cast[,2:10], 0))

conc.sd.matrix <- 
  as.matrix(
    round(
      table.gl.2014.conc.by.size.comp.sd.cast[,2:10], 0))

conc.mean.sd <- 
  as.data.frame(
    matrix(
      paste(
        format(
          conc.mean.matrix), 
        format(
          conc.sd.matrix), 
        sep =" +/- "), 
      nrow = nrow(conc.mean.matrix),
      dimnames = dimnames(conc.mean.matrix)))
                              
trawl.conc.size.mean.sd <- cbind(
  table.gl.2014.conc.by.size.comp.mean.cast$`Size (µm)`, 
  conc.mean.sd
)

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(trawl.conc.size.mean.sd) <- conc.type.col.names.w.tot

trawl.conc.size.mean.sd$n <- 
  trawl.mean.conc.size.count$total_conc_km2

xtable.trawl.conc.size.mean.sd <- 
  xtable(trawl.conc.size.mean.sd, 
         caption = "Mean and standard deviation of individual trawl concentration (km^-2)")

mean.sd.conc.data <- 
  write.csv(
    trawl.conc.size.mean.sd, 
    file = "./outfiles_final/final_mean_sd_trawl_conc_by_size_table.csv", 
    na = "NA")

datatable(trawl.conc.size.mean.sd,
          options = list(pageLength = 3),
          caption = "Mean and standard deviation of individual trawl concentration (km^-2)"
          )

2.3.2 Figures per trawl by size

fracpal <- c("#ebb444", "#1daf7e", "#8c1717")

ggplot(gl.all.trawl.data.nonnumeric.long.comp, 
       aes(x = size_fraction_um, 
           y = concentration+1)) + 
  geom_boxplot(aes(fill = size_fraction_um, alpha = 0.5)) + 
 #geom_point(position = position_jitter(width = 0.5), colour="gray48")+
  theme_bw()+
  scale_y_log10(limits = c(100, 2000000), breaks = c(100, 1000, 10000, 100000, 100000, 1000000)) + 
    xlab("")+
  ylab("% of total plastic counted")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  labs(x= "particle size class (µm)", y = "plastic concentration per trawl (km-2)") + 
  scale_fill_manual(values = fracpal) + 
  theme(legend.position="none") +
  ggtitle("Trawl plastic concentrations per size class")

## Warning: Removed 1862 rows containing non-finite values (stat_boxplot).

gl.all.trawl.data.nonnumeric.long.comp.noNA$conc_type <- factor(gl.all.trawl.data.nonnumeric.long.comp.noNA$conc_type, levels = c("fragment_conc_km2","line_conc_km2","film_conc_km2","foam_conc_km2","nurdle_conc_km2","sphere_conc_km2","paint_conc_km2"))

# define a custom color palette for the shape classes
classPalette <- c("deepskyblue3", "firebrick","darkolivegreen3","deeppink3","gold1", "cadetblue3", "coral1")

ggplot(gl.all.trawl.data.nonnumeric.long.comp.noNA, 
       aes(x = factor(size_fraction_um), 
           y=concentration,
           fill = conc_type)) +   
  theme_bw()+
  scale_fill_manual(values=classPalette)+
  geom_bar(stat = "identity", position = "fill") +
  xlab("plastic size fraction (µm)")+
  ylab("% of total plastic counted")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_y_continuous(labels = percent_format()) +
  ggtitle("Sample composition by plastic shape class")

## Error in check_breaks_labels(breaks, labels): could not find function "percent_format"

ggplot(gl.all.trawl.data.nonnumeric.long.comp.noNA, aes(x = factor(conc_type), y=concentration,fill = conc_type)) +   
  theme_bw()+
  scale_fill_manual(values=classPalette)+
  geom_boxplot() +
  scale_y_log10(breaks = c(100, 1000, 10000, 100000, 100000, 1000000)) + 
  xlab("plastic morphology class")+
  ylab("plastic concentration (km-2)")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  facet_grid(size_fraction_um~station_class)+
  ggtitle("Trawl concentrations by plastic size class, shape, and station type")

## Warning: Removed 1134 rows containing non-finite values (stat_boxplot).

stationPalette <- c( "blue","cyan","maroon","red","orange")

ggplot(gl.all.trawl.data.nonnumeric.long.tot, 
       aes(x = factor(water_body), 
           y=concentration)) +   
  theme_bw()+
  scale_colour_manual(values=stationPalette, name="Station Class")+
  geom_boxplot() +
  geom_jitter(aes(color = station_class), size=1) +
  scale_y_log10(breaks = c(100, 1000, 10000, 100000, 100000, 1000000)) + 
  xlab("Water Body")+
  ylab("plastic concentration (km-2)")+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  facet_grid(.~size_fraction_um)+
  ggtitle("Trawl concentrations by water body and station type")

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

## Warning: Removed 80 rows containing missing values (geom_point).

ggplot(gl.all.trawl.data.nonnumeric.long.tot, 
       aes(x = factor(station_class), 
           y=concentration)) +   
  theme_bw()+
  geom_boxplot() +
  geom_jitter(aes(color = water_body, shape = water_body, fill = water_body), size=1.5) +
  scale_shape_manual(values=c(21,22,23,24,25,8)) +
  scale_y_log10(breaks = c(100, 1000, 10000, 100000, 100000, 1000000)) + 
  xlab("Station class") +
  ylab("Plastic concentration (km-2)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  facet_grid(.~size_fraction_um) +
  ggtitle("Trawl concentrations by station type and water body")

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

## Warning: Removed 80 rows containing missing values (geom_point).

2.3.3 Summary tables by station type

## Long format of no fiber data with only 1000-4750um

trawl.prop.conc.class.a <- 
  subset(gl.all.trawl.data.nonnumeric.long,
         conc_type != 'fiber_conc_km2' & size_fraction_um == '1000-4750')

trawl.prop.conc.class.b <- 
  na.omit(trawl.prop.conc.class.a)

trawl.prop.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        trawl.prop.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

trawl.prop.conc.class.c <-
  aggregate(data = trawl.prop.conc.class.b,
            concentration~station_class+conc_type,
            sum, 
            na.action = na.exclude)

trawl.prop.conc.class.d <-
  dcast(trawl.prop.conc.class.c,
        station_class~conc_type,
        value.var = "concentration",
        fill = NA_real_)

trawl.prop.conc.class.e <-
  cbind(trawl.prop.conc.class.d$'station_class', 
        trawl.prop.conc.class.d[,2:8]/trawl.prop.conc.class.d$'total_conc_km2')

conc.type.col.names.class <- c("Station type", 
                         "Fragment", 
                         "Film", 
                         "Line", 
                         "Nurdle", 
                         "Sphere", 
                         "Foam", 
                         "Paint")

colnames(trawl.prop.conc.class.e) <- 
  conc.type.col.names.class

trawl.prop.conc.class.e$n <- 
  trawl.prop.conc.class.count$total_conc_km2

xtable.trawl.prop.conc.class.e <- 
  xtable(trawl.prop.conc.class.e, 
         digits = 3, 
         caption = "Proportion of individual trawl concentrations, 1000-4750 µm fraction")

trawl.prop.conc.class <- 
  write.csv(
    trawl.prop.conc.class.e, 
    file = "./outfiles_final/final_proportion_ind_trawl_conc_midsize_by_station_class_table.csv", 
    na = "NA")

trawl.prop.conc.class.f <- 
  cbind('Station type' = trawl.prop.conc.class.e[,1], 
        round(trawl.prop.conc.class.e[,-1],3))

datatable(trawl.prop.conc.class.f, 
          options = list(pageLength = 10),
          caption = "Proportion of individual trawl concentrations, 1000-4750 µm fraction"
          )

trawl.mean.conc.class.a <- 
  subset(gl.all.trawl.data.nonnumeric.long,
         size_fraction_um == '1000-4750')

trawl.mean.conc.class.b <- 
  na.omit(trawl.mean.conc.class.a)

trawl.mean.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        trawl.mean.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

trawl.mean.conc.class.c <-
  aggregate(data = trawl.mean.conc.class.b, 
            concentration~station_class+conc_type,
            mean)
 
trawl.mean.conc.class.d <-
  dcast(trawl.mean.conc.class.c,
        station_class~conc_type,
        fill = NA,
        value.var = "concentration")

conc.type.col.names.w.tot <- c("Station type", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(trawl.mean.conc.class.d) <- 
  conc.type.col.names.w.tot

trawl.mean.conc.class.d$n <- 
  trawl.mean.conc.class.count$total_conc_km2

xtable.trawl.mean.conc.class.d <- 
  xtable(trawl.mean.conc.class.d, 
         digits = 3, 
         caption = "Mean of individual trawl concentrations (km^-2), 1000-4750 µm fraction")

mean.conc.data <- 
  write.csv(
    trawl.mean.conc.class.d, 
    file = "./outfiles_final/final_mean_ind_trawl_conc_by_class_table.csv", 
    na = "NA")

trawl.mean.conc.class.e <- 
  cbind('Station type' = trawl.mean.conc.class.d[,1], 
        round(trawl.mean.conc.class.d[,-1],0))

trawl.sd.conc.class.a <- 
  subset(gl.all.trawl.data.nonnumeric.long,
         size_fraction_um == '1000-4750')

trawl.sd.conc.class.b <- 
  na.omit(trawl.sd.conc.class.a)

trawl.sd.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        trawl.sd.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

trawl.sd.conc.class.c <-
  aggregate(data = trawl.sd.conc.class.b, 
            concentration~station_class+conc_type,
            sd)
 
trawl.sd.conc.class.d <-
  dcast(trawl.sd.conc.class.c,
        station_class~conc_type,
        fill = NA,
        value.var = "concentration")

conc.type.col.names.w.tot <- c("Station type", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total plastic", 
                               "Fiber")

colnames(trawl.sd.conc.class.d) <- 
  conc.type.col.names.w.tot

trawl.sd.conc.class.d$n <- 
  trawl.sd.conc.class.count$total_conc_km2

xtable.trawl.sd.conc.class.d <- 
  xtable(trawl.sd.conc.class.d, 
         digits = 3, 
         caption = "Standard deviation of individual trawl concentrations, 1000-4750 µm fraction")

trawl.sd.conc.class.data <- 
  write.csv(
    trawl.sd.conc.class.d, 
    file = "./outfiles_final/final_sd_ind_trawl_conc_class.csv", 
    na = "NA")

trawl.sd.conc.class.e <- 
  cbind('Station type' = trawl.sd.conc.class.d[,1], 
        round(trawl.sd.conc.class.d[,-1],0))

trawl.mean.conc.class.matrix <- 
  as.matrix(
    round(
      trawl.mean.conc.class.d[,c(2:10)], 0))

trawl.sd.conc.class.matrix <- 
  as.matrix(
    round(
      trawl.sd.conc.class.d[,c(2:10)], 0))

trawl.mean.sd.conc.class <- 
  as.data.frame(
    matrix(
      paste(
        format(
          trawl.mean.conc.class.matrix), 
        format(
          trawl.sd.conc.class.matrix), 
        sep =" +/- "), 
      nrow = nrow(trawl.mean.conc.class.matrix),
      dimnames = dimnames(trawl.mean.conc.class.matrix)))
                              
trawl.mean.sd.conc.class <- cbind(
  trawl.mean.conc.class.d$`Station type`, 
  trawl.mean.sd.conc.class
  )

conc.type.col.names.w.tot <- c("Station type", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total plastic", 
                               "Fiber")

colnames(trawl.mean.sd.conc.class) <- 
  conc.type.col.names.w.tot

trawl.mean.sd.conc.class$n <- 
  trawl.mean.conc.class.count$total_conc_km2

xtable.trawl.mean.sd.conc.class <- 
  xtable(trawl.mean.sd.conc.class, 
         caption = "Mean and standard deviation of individual trawl concentration (km^-2), 1000-4750 µm fraction")

mean.sd.conc.data <- 
  write.csv(
    trawl.mean.sd.conc.class, 
    file = "./outfiles_final/final_mean_sd_trawl_midsize_conc_by_class.csv", 
    na = "NA")

datatable(trawl.mean.sd.conc.class, 
          options = list(pageLength = 10),
          caption = "Mean and standard deviation of individual trawl concentration (km^-2), 1000-4750 µm fraction"
          )

2.4 Data by station (mean of trawls at each station)

## Create a dataframe of mean concentrations by station 

table.gl.2014.conc.by.stn.size.comp.melt <- 
  melt(gl.2014.plastics.conc.df, 
       measure.vars = c('fragment_conc_km2', 
                        'film_conc_km2', 
                        'line_conc_km2', 
                        'nurdle_conc_km2', 
                        'sphere_conc_km2', 
                        'foam_conc_km2', 
                        'paint_conc_km2',
                        'total_conc_km2', 
                        'fiber_conc_km2'), 
       id.vars = c('station', 
                   'size_fraction_um'), 
       value.name = "concentration", 
       variable.name = "conc_type", 
       na.rm = FALSE)

table.gl.2014.conc.by.stn.size.comp.melt.nona <- 
  na.omit(table.gl.2014.conc.by.stn.size.comp.melt)

gl.2014.stn.conc.wide <- 
  dcast(
    table.gl.2014.conc.by.stn.size.comp.melt.nona, 
    station + size_fraction_um ~ conc_type, 
    value.var = 'concentration', 
    mean, 
    fill = NA_real_)

table.gl.2014.stn.conc.by.size.comp.long <- 
  melt(gl.2014.stn.conc.wide, 
       measure.vars = c('fragment_conc_km2', 
                        'film_conc_km2', 
                        'line_conc_km2', 
                        'nurdle_conc_km2', 
                        'sphere_conc_km2', 
                        'foam_conc_km2', 
                        'paint_conc_km2',
                        'total_conc_km2', 
                        'fiber_conc_km2'), 
       id.vars = c('station','size_fraction_um'), 
       value.name = "mean_concentration", 
       variable.name = "conc_type", 
       na.rm = FALSE)

2.4.1 Summary tables by size

stn.tot.conc.summary.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('station','size_fraction_um'), 
    function(x) summary(x$'concentration'))

stn.tot.conc.sd.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('station','size_fraction_um'), 
    function(x) sd(x$'concentration'))

stn.tot.conc.iqr.by.size <- 
  ddply(
    table.gl.2014.conc.by.size.comp.melt.nona.tot, 
    c('station','size_fraction_um'), 
    function(x) IQR(x$'concentration'))

stn.tot.conc.summary.by.size$StDev. <- 
  round(stn.tot.conc.sd.by.size$V1, 0)

stn.tot.conc.summary.by.size$IQR <- 
  round(stn.tot.conc.iqr.by.size$V1, 0)

stn.tot.conc.summary.by.size$Range <- 
  stn.tot.conc.summary.by.size$Max. - stn.tot.conc.summary.by.size$Min.

stn.tot.conc.summary.by.size$MNR <- 
  round((stn.tot.conc.summary.by.size$Range/ stn.tot.conc.summary.by.size$Mean), 3)
  
stn.tot.conc.summary.by.size$MNR[is.na(stn.tot.conc.summary.by.size$MNR)] <- NA

stn.tot.conc.summary.by.size$CoV <- 
  round((stn.tot.conc.summary.by.size$StDev./ stn.tot.conc.summary.by.size$Mean), 3)

datatable(
  stn.tot.conc.summary.by.size,
  options = list(pageLength = 20),
  caption = "Summary of station mean total plastic concentrations (km^-2)")

stn.tot.conc.summary.by.size.table <- 
  write.csv(
    stn.tot.conc.summary.by.size, 
    file = "./outfiles_final/final_data_sheet_3_stn.tot.conc.summary.by.size_table.csv",
    na = "NA")

2.4.1.1 Mean-normalized station range plots

fracpal <- c("#ebb444", "#1daf7e", "#8c1717")

ggplot(stn.tot.conc.summary.by.size, aes(y = MNR, x = size_fraction_um)) +
  theme_bw() +
  geom_boxplot(aes(fill = size_fraction_um), alpha = 0.5) +
  scale_fill_manual(values = fracpal)

## Warning: Removed 13 rows containing non-finite values (stat_boxplot).

gl.2014.trawl.conc.sum.data <- 
  merge(gl.2014.all.data, stn.tot.conc.summary.by.size, 
        by = c("station", "size_fraction_um"), 
        all.x = TRUE, 
        suffixes = c(".x", ".y"))

ggplot(gl.2014.trawl.conc.sum.data, aes(x = factor(size_fraction_um), y=MNR)) +
  scale_colour_manual(values=stationPalette, name="Station Class")+
  geom_boxplot(alpha=0.5) +
  geom_jitter(aes(color = station_class), size=1)

## Warning: Removed 86 rows containing non-finite values (stat_boxplot).

## Warning: Removed 86 rows containing missing values (geom_point).

ggplot(gl.2014.trawl.conc.sum.data, aes(y = MNR, x = station_class)) +
  scale_colour_manual(values=fracpal, name="Size Class")+
  geom_boxplot(alpha=0.5) +
  geom_jitter(aes(color = size_fraction_um), size=1)

## Warning: Removed 86 rows containing non-finite values (stat_boxplot).

## Warning: Removed 86 rows containing missing values (geom_point).

ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$MNR),], 
         aes(y = MNR, x = total_conc_km2)) + 
  geom_hline(data = gl.2014.trawl.conc.sum.data,
             aes(yintercept= 1),
             linetype = "dashed",
             size = 0.5, 
             color = "red") +
    geom_point(size = 2.0, 
               alpha = 0.85,
               aes(color = size_fraction_um, 
                   fill = size_fraction_um)) + 
  geom_smooth(method = "lm", alpha = 0.2, aes(colors = "black")) +
  scale_x_log10() + 
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,3)) + 
  scale_color_manual(values = fracpal)

## Warning: Removed 67 rows containing non-finite values (stat_smooth).

## Warning: Removed 39 rows containing missing values (geom_point).

lm_trawl_tot_mnr <- 
  lm(data = gl.2014.trawl.conc.sum.data, 
     formula = MNR ~ log10(total_conc_km2+1), 
     na.action = na.omit)

summary(lm_trawl_tot_mnr)

## 
## Call:
## lm(formula = MNR ~ log10(total_conc_km2 + 1), data = gl.2014.trawl.conc.sum.data, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50918 -0.53223  0.02943  0.34859  1.55751 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2.73684    0.10775   25.40   <2e-16 ***
## log10(total_conc_km2 + 1) -0.35671    0.02794  -12.77   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6953 on 206 degrees of freedom
##   (116 observations deleted due to missingness)
## Multiple R-squared:  0.4418, Adjusted R-squared:  0.4391 
## F-statistic:   163 on 1 and 206 DF,  p-value: < 2.2e-16

gl.2014.trawl.conc.sum.data.numeric.only<- 
  gl.2014.trawl.conc.sum.data[sapply(gl.2014.trawl.conc.sum.data, 
                                     is.numeric)]
corrtab.gl.2014.trawl.conc.sum.data <-
  rcorr(as.matrix(abs(gl.2014.trawl.conc.sum.data.numeric.only)), type = 'spearman')

corrtab.mnr.gl.2014.trawl.conc.sum.data <- 
  as.data.frame(
    cbind(rho = corrtab.gl.2014.trawl.conc.sum.data$r[,32], 
          p_value = corrtab.gl.2014.trawl.conc.sum.data$P[,32]))

flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

flat.corrtab.gl.2014.trawl.conc.sum.data <-
  as.data.frame(
    flattenCorrMatrix(corrtab.gl.2014.trawl.conc.sum.data$r,
                    corrtab.gl.2014.trawl.conc.sum.data$P))

datatable(corrtab.mnr.gl.2014.trawl.conc.sum.data,
          options = list(pageLength = 10),
          caption = "Spearman's correlation of MNR vs. absolute value of metadata")

ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$MNR),], 
         aes(y = MNR, x = total_conc_km2)) +
    geom_point(size = 2, 
               alpha = 1,
               aes(color = station_class, 
                   shape = factor(size_fraction_um))) + 
  geom_smooth(method = "lm", alpha = 0.2) +
  scale_x_log10() + 
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,3)) + 
  geom_hline(data = gl.2014.trawl.conc.sum.data,
             aes(yintercept= 1),
             linetype = "dashed",
             size = 0.5, 
             color = "red")

## Warning: Removed 67 rows containing non-finite values (stat_smooth).

## Warning: Removed 39 rows containing missing values (geom_point).

gl.2014.trawl.conc.sum.data <- 
  merge(gl.2014.all.data, stn.tot.conc.summary.by.size, 
        by = c("station", "size_fraction_um"), 
        all.x = TRUE, 
        suffixes = c(".x", ".y"))

  ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$MNR),],aes(y = MNR, x = Mean, color = station_class)) +
   geom_point(aes(color = factor(station_class)),
                  size = 0.8,
                  alpha = 0.5) + 
  geom_smooth(method = "lm", alpha = 0.2) +
  scale_x_log10() +
  facet_grid(.~ size_fraction_um, scales="free") +
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,3))

## Warning: Removed 9 rows containing non-finite values (stat_smooth).

## Warning: Removed 9 rows containing missing values (geom_point).

## Warning: Removed 17 rows containing missing values (geom_smooth).

table.gl.2014.stn.conc.by.size.comp.long.nona <- 
  na.omit(table.gl.2014.stn.conc.by.size.comp.long)

stn.prop.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.stn.conc.by.size.comp.long.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.stn.conc.by.size.comp.sum <-
  aggregate(data = table.gl.2014.stn.conc.by.size.comp.long.nona,
            mean_concentration~size_fraction_um+conc_type,
            sum, 
            na.action = na.exclude)
 
table.gl.2014.stn.conc.by.size.comp.sum.cast <-
  dcast(table.gl.2014.stn.conc.by.size.comp.sum,
        size_fraction_um~conc_type,
        value.var = "mean_concentration",
        fill = NA_real_)

table.gl.2014.stn.conc.totals.by.size.comp.prop <-
  cbind(table.gl.2014.stn.conc.by.size.comp.sum.cast$'size_fraction_um', 
        table.gl.2014.stn.conc.by.size.comp.sum.cast[,2:8]/table.gl.2014.stn.conc.by.size.comp.sum.cast$'total_conc_km2')

conc.type.col.names <- c("Size (µm)", 
                         "Fragment", 
                         "Film", 
                         "Line", 
                         "Nurdle", 
                         "Sphere", 
                         "Foam", 
                         "Paint")

colnames(table.gl.2014.stn.conc.totals.by.size.comp.prop) <- 
  conc.type.col.names

table.gl.2014.stn.conc.totals.by.size.comp.prop$n <- 
  stn.prop.conc.size.count$total_conc_km2

xtable.prop.stn.conc.types <- 
  xtable(table.gl.2014.stn.conc.totals.by.size.comp.prop, 
         digits = 3, 
         caption = "Proportion of mean station concentration")

prop.stn.conc.data <- 
  write.csv(
    table.gl.2014.stn.conc.totals.by.size.comp.prop, 
    file = "./outfiles_final/final_proportion_mean_stn_conc_by_size_table.csv", 
    na = "NA")

table.gl.2014.stn.conc.totals.by.size.comp.prop.round <- 
  cbind('Size (µm)' = table.gl.2014.stn.conc.totals.by.size.comp.prop[,1], 
        round(table.gl.2014.stn.conc.totals.by.size.comp.prop[,-1],3))

datatable(table.gl.2014.stn.conc.totals.by.size.comp.prop.round, 
          options = list(pageLength = 3),
          caption = "Proportion of mean station concentration"
          )

table.gl.2014.stn.conc.by.size.comp.long.nona <- 
  na.omit(table.gl.2014.stn.conc.by.size.comp.long)

stn.mean.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.stn.conc.by.size.comp.long.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.stn.conc.by.size.comp.mean <-
  aggregate(data = table.gl.2014.stn.conc.by.size.comp.long.nona, 
            mean_concentration~size_fraction_um+conc_type,
            mean)
 
table.gl.2014.stn.conc.by.size.comp.mean.cast <-
  dcast(table.gl.2014.stn.conc.by.size.comp.mean,
        size_fraction_um~conc_type,
        fill = NA,
        value.var = "mean_concentration")

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total plastic", 
                               "Fiber")

colnames(table.gl.2014.stn.conc.by.size.comp.mean.cast) <- 
  conc.type.col.names.w.tot

table.gl.2014.stn.conc.by.size.comp.mean.cast$n <- 
  stn.mean.conc.size.count$total_conc_km2

xtable.mean.stn.conc.types <- 
  xtable(table.gl.2014.stn.conc.by.size.comp.mean.cast, 
         digits = 3, 
         caption = "Mean of station mean concentration (km^-2)")

mean.stn.conc.data <- 
  write.csv(
    table.gl.2014.stn.conc.by.size.comp.mean.cast, 
    file = "./outfiles_final/final_mean_mean_stn_conc_by_size_table.csv",
    na = "NA")

table.gl.2014.stn.conc.by.size.comp.mean.cast.round <- 
  cbind('Size (µm)' = table.gl.2014.stn.conc.by.size.comp.mean.cast[,1], 
        round(table.gl.2014.stn.conc.by.size.comp.mean.cast[,-1],3))

table.gl.2014.stn.conc.by.size.comp.long.nona <- 
  na.omit(table.gl.2014.stn.conc.by.size.comp.long)

stn.sd.conc.size.count <- 
  dcast(
    as.data.frame(
      table(
        table.gl.2014.stn.conc.by.size.comp.long.nona[,c('size_fraction_um', 'conc_type')]
        )
      ),
    size_fraction_um~conc_type
  )

## Using Freq as value column: use value.var to override.

table.gl.2014.stn.conc.by.size.comp.sd <-
  aggregate(data = table.gl.2014.stn.conc.by.size.comp.long.nona, 
            mean_concentration~size_fraction_um+conc_type,
            sd)
 
table.gl.2014.stn.conc.by.size.comp.sd.cast <-
  dcast(table.gl.2014.stn.conc.by.size.comp.sd,
        size_fraction_um~conc_type,
        fill = NA,
        value.var = "mean_concentration")

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(table.gl.2014.stn.conc.by.size.comp.sd.cast) <- 
  conc.type.col.names.w.tot

table.gl.2014.stn.conc.by.size.comp.sd.cast$n <- 
  stn.sd.conc.size.count$total_conc_km2

xtable.sd.stn.conc.types <- 
  xtable(table.gl.2014.stn.conc.by.size.comp.sd.cast, 
         digits = 3, 
         caption = "Standard deviation of station mean concentrations")

sd.stn.conc.data <- 
  write.csv(
    table.gl.2014.stn.conc.by.size.comp.sd.cast, 
    file = "./outfiles_final/final_sd_mean_stn_conc_by_size_table.csv",
    na = "NA")

table.gl.2014.stn.conc.by.size.comp.sd.cast.round <- 
  cbind('Size (µm)' = table.gl.2014.stn.conc.by.size.comp.sd.cast[,1], 
        round(table.gl.2014.stn.conc.by.size.comp.sd.cast[,-1],3))

stn.mean.conc.size.matrix <- 
  as.matrix(
    round(
      table.gl.2014.stn.conc.by.size.comp.mean.cast[,-1],0
      )
    )

stn.sd.conc.size.matrix <- 
  as.matrix(
    round(
      table.gl.2014.stn.conc.by.size.comp.sd.cast[,-1],0
      )
    )

stn.conc.size.mean.sd <- 
  as.data.frame(
    matrix(
      paste(
        format(stn.mean.conc.size.matrix), 
        format(stn.sd.conc.size.matrix), 
        sep =" +/- "), 
      nrow = nrow(stn.mean.conc.size.matrix),
      dimnames = dimnames(stn.mean.conc.size.matrix)))
                              
stn.conc.size.mean.sd <- 
  cbind(
    table.gl.2014.stn.conc.by.size.comp.mean.cast$`Size (µm)`, 
    stn.conc.size.mean.sd
)

conc.type.col.names.w.tot <- c("Size (µm)", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total plastic", 
                               "Fiber")

colnames(stn.conc.size.mean.sd) <- 
  conc.type.col.names.w.tot
 
stn.conc.size.mean.sd$n <- 
  stn.mean.conc.size.count$total_conc_km2

xtable.stn.conc.size.mean.sd <- 
  xtable(stn.conc.size.mean.sd, 
         caption = "Mean and standard deviation of station mean concentration (km^-2)")

mean.sd.stn.conc.data <- 
  write.csv(
    stn.conc.size.mean.sd, 
    file = "./outfiles_final/final_mean_sd_stn_mean_conc_by_size_table.csv", 
    na = "NA")

datatable(stn.conc.size.mean.sd, 
          options = list(pageLength = 3),
          caption = "Mean and standard deviation of station mean concentration (km^-2)"
          )

2.4.1.2 CoV station range plots

ggplot(stn.tot.conc.summary.by.size, aes(y = CoV, x = size_fraction_um)) +
  geom_boxplot(aes(fill = size_fraction_um), alpha = 0.5) +
  theme_bw() +
  scale_fill_manual(values = fracpal)

## Warning: Removed 29 rows containing non-finite values (stat_boxplot).

ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$CoV),], 
         aes(y = CoV, x = total_conc_km2)) + 
  geom_hline(data = gl.2014.trawl.conc.sum.data,
             aes(yintercept= 1),
             linetype = "dashed",
             size = 0.5, 
             color = "red") +
    geom_point(size = 2.0, 
               alpha = 0.85,
               aes(color = size_fraction_um, 
                   fill = size_fraction_um)) + 
  geom_smooth(method = "lm", alpha = 0.2, aes(colors = "black")) +
  scale_x_log10() + 
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,1.75)) + 
  scale_color_manual(values = fracpal)

## Warning: Removed 35 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

lm_trawl_tot_cov <- 
  lm(data = gl.2014.trawl.conc.sum.data, 
     formula = CoV ~ log10(total_conc_km2+1), 
     na.action = na.omit)

summary(lm_trawl_tot_cov)

## 
## Call:
## lm(formula = CoV ~ log10(total_conc_km2 + 1), data = gl.2014.trawl.conc.sum.data, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.86847 -0.25702 -0.01832  0.22918  0.85496 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                1.50382    0.06068   24.79   <2e-16 ***
## log10(total_conc_km2 + 1) -0.18430    0.01644  -11.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3838 on 190 degrees of freedom
##   (132 observations deleted due to missingness)
## Multiple R-squared:  0.3981, Adjusted R-squared:  0.3949 
## F-statistic: 125.7 on 1 and 190 DF,  p-value: < 2.2e-16

ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$CoV),], 
         aes(y = CoV, x = total_conc_km2)) +
    geom_point(size = 2, 
               alpha = 1,
               aes(color = station_class, 
                   shape = factor(size_fraction_um))) + 
  geom_smooth(method = "lm", alpha = 0.2) +
  scale_x_log10() + 
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,1.75)) + 
  geom_hline(data = gl.2014.trawl.conc.sum.data,
             aes(yintercept= 1),
             linetype = "dashed",
             size = 0.5, 
             color = "red")

## Warning: Removed 35 rows containing non-finite values (stat_smooth).

## Warning: Removed 1 rows containing missing values (geom_point).

  ggplot(gl.2014.trawl.conc.sum.data[!is.na(gl.2014.trawl.conc.sum.data$CoV),],
         aes(y = CoV, 
             x = Mean, 
             color = station_class)) +
   geom_point(aes(color = factor(station_class)),
                  size = 0.8,
                  alpha = 0.5) + 
  geom_smooth(method = "lm", alpha = 0.2) +
  scale_x_log10() +
  facet_grid(.~ size_fraction_um, scales="free") +
  theme(axis.text.x = element_text(size = 6, angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0,1.75))

## Warning: Removed 11 rows containing missing values (geom_smooth).

2.4.2 Summary tables by station type

gl.all.trawl.data.nonnumeric.long.nona.tot <- 
   subset(gl.all.trawl.data.nonnumeric.long.nona, 
      conc_type == 'total_conc_km2')
#View(gl.all.trawl.data.nonnumeric.long.nona.tot)

stn.tot.conc.mdata.summary.by.size <- 
  ddply(
    gl.all.trawl.data.nonnumeric.long.nona.tot, 
    c('station',
      'size_fraction_um', 
      'water_body', 
      'location', 
      'station_class'), 
    function(x) summary(x$'concentration'))

stn.tot.conc.mdata.sd.by.size <- 
  ddply(
    gl.all.trawl.data.nonnumeric.long.nona.tot, 
    c('station',
      'size_fraction_um', 
      'water_body', 
      'location', 
      'station_class'), 
    function(x) sd(x$'concentration'))

stn.tot.conc.mdata.iqr.by.size <- 
  ddply(
    gl.all.trawl.data.nonnumeric.long.nona.tot, 
     c('station',
      'size_fraction_um', 
      'water_body', 
      'location', 
      'station_class'), 
    function(x) IQR(x$'concentration'))

stn.tot.conc.mdata.summary.by.size$StDev. <- 
  round(stn.tot.conc.mdata.sd.by.size$V1, 0)

stn.tot.conc.mdata.summary.by.size$IQR <- 
  round(stn.tot.conc.mdata.iqr.by.size$V1, 0)

stn.tot.conc.mdata.summary.by.size$Range <- 
  stn.tot.conc.mdata.summary.by.size$Max. - 
  stn.tot.conc.mdata.summary.by.size$Min.

datatable(
  stn.tot.conc.mdata.summary.by.size,
  options = list(pageLength = 10),
  caption = "Summary of station mean total plastic concentrations (km^-2)")

The highest single trawl concentration is 1.88810^{6} km^-2, found in the 106-1000 µm size fraction at the Detroit WWTP of Detroit Rv..

## Long format of data with only 1000-4750um

stn.prop.conc.class.a <- 
  subset(gl.all.data.nonnumeric.mean.stn.long,
         conc_type != 'fiber_conc_km2' & size_fraction_um == '1000-4750')

stn.prop.conc.class.b <- 
  na.omit(stn.prop.conc.class.a)

stn.prop.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        stn.prop.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

stn.prop.conc.class.c <-
  aggregate(data = stn.prop.conc.class.b,
            mean_concentration~station_class+conc_type,
            sum, 
            na.action = na.exclude)

stn.prop.conc.class.d <-
  dcast(stn.prop.conc.class.c,
        station_class~conc_type,
        value.var = "mean_concentration",
        fill = NA_real_)

stn.prop.conc.class.e <-
  cbind(stn.prop.conc.class.d$'station_class', 
        stn.prop.conc.class.d[,2:8]/stn.prop.conc.class.d$'total_conc_km2')

conc.type.col.names.class <- c("Station type", 
                         "Fragment", 
                         "Film", 
                         "Line", 
                         "Nurdle", 
                         "Sphere", 
                         "Foam", 
                         "Paint")

colnames(stn.prop.conc.class.e) <- 
  conc.type.col.names.class

stn.prop.conc.class.e$n <- 
  stn.prop.conc.class.count$total_conc_km2

xtable.stn.prop.conc.class <- 
  xtable(stn.prop.conc.class.e, 
         digits = 3, 
         caption = "Proportion of station mean concentrations, 1000-4750 µm fraction")

prop.stn.midsize.class.conc.data <- 
  write.csv(
    stn.prop.conc.class.e, 
    file = "./outfiles_final/final_proportion_stn_mean_conc_midsize_by_station_class_table.csv", 
    na = "NA")

stn.prop.conc.class.f <- 
  cbind('Station type' = stn.prop.conc.class.e[,1], 
        round(stn.prop.conc.class.e[,-1],3))

datatable(stn.prop.conc.class.f, 
          options = list(pageLength = 10),
          caption = "Proportion of station mean concentrations, 1000-4750 µm fraction"
          )

stn.mean.conc.class.a <- 
  subset(gl.all.data.nonnumeric.mean.stn.long,
         size_fraction_um == '1000-4750')

stn.mean.conc.class.b <- 
  na.omit(stn.mean.conc.class.a)

stn.mean.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        stn.mean.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

stn.mean.conc.class.c <-
  aggregate(data = stn.mean.conc.class.b, 
            mean_concentration~station_class+conc_type,
            mean)
 
stn.mean.conc.class.d <-
  dcast(stn.mean.conc.class.c,
        station_class~conc_type,
        fill = NA,
        value.var = "mean_concentration")

conc.type.col.names.w.tot <- c("Station type", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total Plastic", 
                               "Fiber")

colnames(stn.mean.conc.class.d) <- 
  conc.type.col.names.w.tot

stn.mean.conc.class.d$n <- 
  stn.mean.conc.class.count$total_conc_km2

xtable.stn.mean.conc.class.d <- 
  xtable(stn.mean.conc.class.d, 
         digits = 3, 
         caption = "Mean of station mean concentrations (km^-2), 1000-4750 µm fraction")

mean.conc.data <- 
  write.csv(
    stn.mean.conc.class.d, 
    file = "./outfiles_final/final_mean_stn_mean_conc_by_class_table.csv", 
    na = "NA")

stn.mean.conc.class.e <- 
  cbind('Station type' = stn.mean.conc.class.d[,1], 
        round(stn.mean.conc.class.d[,-1],0))

stn.sd.conc.class.a <- 
  subset(gl.all.data.nonnumeric.mean.stn.long,
         size_fraction_um == '1000-4750')

stn.sd.conc.class.b <- 
  na.omit(stn.sd.conc.class.a)

stn.sd.conc.class.count <- 
  dcast(
    as.data.frame(
      table(
        stn.sd.conc.class.b[,c('station_class', 'conc_type')]
        )
      ),
    station_class~conc_type
  )

## Using Freq as value column: use value.var to override.

stn.sd.conc.class.c <-
  aggregate(data = stn.sd.conc.class.b, 
            mean_concentration~station_class+conc_type,
            sd)
 
stn.sd.conc.class.d <-
  dcast(stn.sd.conc.class.c,
        station_class~conc_type,
        fill = NA,
        value.var = "mean_concentration")

conc.type.col.names.w.tot <- c("Station type", 
                               "Fragment", 
                               "Film", 
                               "Line", 
                               "Nurdle", 
                               "Sphere", 
                               "Foam", 
                               "Paint", 
                               "Total plastic", 
                               "Fiber")

colnames(stn.sd.conc.class.d) <- 
  conc.type.col.names.w.tot

stn.sd.conc.class.d$n <- 
  stn.sd.conc.class.count$total_conc_km2

xtable.stn.sd.conc.class.d <- 
  xtable(stn.sd.conc.class.d, 
         digits = 3, 
         caption = "Standard deviation of station mean concentrations, 1000-4750 µm fraction")

stn.sd.conc.class.data <- 
  write.csv(
    stn.sd.conc.class.d, 
    file = "./outfiles_final/final_sd_stn_mean_conc_class.csv", 
    na = "NA")

stn.sd.conc.class.e <- 
  cbind('Station type' = stn.sd.conc.class.d[,1], 
        round(stn.sd.conc.class.d[,-1],0))

stn.mean.conc.class.matrix <- 
  as.matrix(
    round(
      stn.mean.conc.class.d[,c(2:10)], 0))

stn.sd.conc.class.matrix <- 
  as.matrix(
    round(
      stn.sd.conc.class.d[,c(2:10)], 0))

stn.mean.sd.conc.class <- 
  as.data.frame(
    matrix(
      paste(
        format(
          stn.mean.conc.class.matrix), 
        format(
          stn.sd.conc.class.matrix), 
        sep =" +/- "), 
      nrow = nrow(stn.mean.conc.class.matrix),
      dimnames = dimnames(stn.mean.conc.class.matrix)))
                              
stn.mean.sd.conc.class <- 
  cbind(
    stn.mean.conc.class.d$`Station type`, 
    stn.mean.sd.conc.class
  )

conc.type.col.names.w.tot <- 
  c("Station type", 
    "Fragment", 
    "Film", 
    "Line", 
    "Nurdle", 
    "Sphere", 
    "Foam", 
    "Paint", 
    "Total plastic", 
    "Fiber")

colnames(stn.mean.sd.conc.class) <- 
  conc.type.col.names.w.tot

stn.mean.sd.conc.class$n <- 
  stn.mean.conc.class.count$total_conc_km2
  
# kable(stn.mean.sd.conc.class, 
#       caption = "Mean and standard deviation of station mean concentration (km^-2), 1000-4750 µm fraction")

xtable.stn.mean.sd.conc.class <- 
  xtable(stn.mean.sd.conc.class, 
         caption = "Mean and standard deviation of station mean concentration (km^-2), 1000-4750 µm fraction")

mean.sd.conc.data <- 
  write.csv(
    stn.mean.sd.conc.class, 
    file = "./outfiles_final/final_mean_sd_stn_midsize_conc_by_class.csv", 
    na = "NA")

datatable(stn.mean.sd.conc.class, 
          options = list(pageLength = 10),
          caption = "Mean and standard deviation of station mean concentration (km^-2), 1000-4750 µm fraction"
          )

3 Blank counts

Blank Size Fragment Fiber Total 1 1 1000-4750 0 32 32 2 2 1000-4750 0 33 33 3 3 1000-4750 0 8 8 4 1 100-1000 0 7 7 5 2 100-1000 1 19 20 6 3 100-1000 0 26 26 7 1 >4750 0 0 0 8 2 >4750 0 0 0 9 3 >4750 0 0 0

4 EDS data

## Import into dataframe
gl.2014.eds.calls <- 
  read.csv(
    file = "~/Box Sync/Duhaime Lab/R projects/gl_2014_count_data_github/final_directory/duhaime_gl_2014_EDS_calls.csv")

## Take out rows with no data (all spectra repeats)
gl.2014.eds.calls <- 
  na.omit(gl.2014.eds.calls)

## Create a contingency table of P/NP calls vs. suspected type

eds.calls.by.suspect <- 
  table(
    gl.2014.eds.calls[,c("suspected","final_call")])

## Run a chi-square test for independence
chisq.eds <- chisq.test(eds.calls.by.suspect)

## Warning in chisq.test(eds.calls.by.suspect): Chi-squared approximation may
## be incorrect

chisq.eds$p.value

## [1] 2.00255e-23

chisq.eds$alternative

## NULL

## Run a fisher test for the same thing, but for tables with small counts
fisher.eds <- fisher.test(eds.calls.by.suspect, hybrid = TRUE, simulate.p.value = FALSE)

fisher.eds$p.value

## [1] 9.842566e-29

fisher.eds$alternative

## [1] "two.sided"

## The p-value is very low, indicating that the data vary significantly from an even distribution, both between call types and between suspected type, but there is a warning that the results may be incorrect, considered some expected values are very close to zero
#chisq.eds$expected

## Take a look at the residuals to see how the distributions vary from the predicted "even" distribution
#chisq.eds$residuals

## It looks as if NP and M calls occur significantly more often in the nonplastic category, and the P and P-NP calls occur more often with the plastic category - yay!

I created a contingency table of a piece’s suspected type vs. type determined from spectra:

eds.calls.by.suspect

##              final_call
## suspected      M M-P NP NP-P  P
##   not plastic 35   1 46   11  7
##   plastic     10   0  2   12 76

So we can see that 76% of all pieces visually identified as plastic were confirmed as plastic, while 12% couldn’t be positively identified as plastic or not plastic by EDS. 81% of pieces were confirmed as not plastic, while only 7% of pieces were incorrectly identified as non-plastic.

To test these percentages for significance, I performed a Chi-square test of independence on the contingency table:

chisq.eds

## 
##  Pearson's Chi-squared test
## 
## data:  eds.calls.by.suspect
## X-squared = 112.63, df = 4, p-value < 2.2e-16

It looks as if the p-value is very low, indicating that the data vary significantly from an even distribution, both between call types and between suspected type, but there is a warning that the results may be incorrect, probably because some expected values are very close to zero.

chisq.eds$expected

##              final_call
## suspected        M M-P NP NP-P    P
##   not plastic 22.5 0.5 24 11.5 41.5
##   plastic     22.5 0.5 24 11.5 41.5

But when you look at the residuals of the test, it looks as if NP and M calls occur more often in the nonplastic category, and the P and P-NP calls occur more often with the plastic category - yay!

chisq.eds$residuals

##              final_call
## suspected              M        M-P         NP       NP-P          P
##   not plastic  2.6352314  0.7071068  4.4907312 -0.1474420 -5.3554386
##   plastic     -2.6352314 -0.7071068 -4.4907312  0.1474420  5.3554386

That was kind of fun, so I took a look at what effect the spectrum reader might have:

## This is kind of fun, so I want to do the same thing with the person reading the spectra and the calls that are made

eds.calls.by.reader <- 
  table(
    gl.2014.eds.calls[,c("reader","final_call")])

eds.calls.by.reader

##                           final_call
## reader                      M M-P NP NP-P  P
##   andrew                    7   0  4    0  9
##   andrew, brendan           7   1  3    1  8
##   brendan                  21   0 33   14 52
##   kaitie                    8   0  2    5  5
##   rachel, brendan, melissa  2   0  6    3  9

chisq.eds.reader <- chisq.test(eds.calls.by.reader)

## Warning in chisq.test(eds.calls.by.reader): Chi-squared approximation may
## be incorrect

chisq.eds.reader

## 
##  Pearson's Chi-squared test
## 
## data:  eds.calls.by.reader
## X-squared = 28.535, df = 16, p-value = 0.02727

chisq.eds.reader$expected

##                           final_call
## reader                        M M-P   NP NP-P    P
##   andrew                    4.5 0.1  4.8  2.3  8.3
##   andrew, brendan           4.5 0.1  4.8  2.3  8.3
##   brendan                  27.0 0.6 28.8 13.8 49.8
##   kaitie                    4.5 0.1  4.8  2.3  8.3
##   rachel, brendan, melissa  4.5 0.1  4.8  2.3  8.3

residuals(chisq.eds.reader)

##                           final_call
## reader                               M         M-P          NP        NP-P
##   andrew                    1.17851130 -0.31622777 -0.36514837 -1.51657509
##   andrew, brendan           1.17851130  2.84604989 -0.82158384 -0.85719462
##   brendan                  -1.15470054 -0.77459667  0.78262379  0.05383819
##   kaitie                    1.64991582 -0.31622777 -1.27801930  1.78032728
##   rachel, brendan, melissa -1.17851130 -0.31622777  0.54772256  0.46156633
##                           final_call
## reader                               P
##   andrew                    0.24297355
##   andrew, brendan          -0.10413152
##   brendan                   0.31175111
##   kaitie                   -1.14544672
##   rachel, brendan, melissa  0.24297355

It looks as if that pesky M-P call might be screwing with the chi-square’s ability to compare distributions, so it might be worth taking out of the dataset completely (I had an internal battle as to whether it should be left in for future comparisons).

# eds.calls.by.suspect.sample.xtab <- 
#   xtabs(
#     ~gl.2014.eds.calls$barcode_range+gl.2014.eds.calls$final_call, 
#     data = gl.2014.eds.calls$suspected,
#     sparse = FALSE)

eds.calls.by.suspect.sample <- 
  as.data.frame(
    table(
      gl.2014.eds.calls[,c("final_call", "suspected", "trawl")]))

eds.calls.by.suspect.sample$prop <- eds.calls.by.suspect.sample$Freq/10

ggplot(data = eds.calls.by.suspect.sample,
       aes(x=trawl,
           y=prop,
           color=final_call,
           fill=final_call)) +
  geom_bar(stat = "identity") +
  facet_grid(.~suspected) + 
  theme(axis.text.x = element_text(size = 10,
                                   angle = 45,
                                   hjust = 1)) + 
  theme(axis.title.x = element_blank()) +
  theme(legend.title = element_blank()) +
  ylab ("Proportion of pieces examined")

## I'm starting to play with this, but haven't really gotten far on how it can determine our "confidence" in the accuracy of our calls.

## WARNING: This package required updated packages, which require restarting R - I got and error message when I tried to require ggtern and had to manually update the package Rcpp

ggtern(
  data=gl.2014.eds.calls, 
  aes(
    nonplastic_om, 
    mineral, 
    plastic, 
    color=suspected,
    )) +
  geom_point() +
  theme_nomask()+
  geom_confidence_tern(aes(countour = TRUE))

Great Lakes Plastics 2014 Data

Rachel N. Cable & Melissa B. Duhaime

April 25, 2017