Intro

This script produces all results and figures in the manuscript ‘A novel method for using RNA-seq data to identify imprinted genes in social Hymenoptera with multiply mated queens’. It is based on the data from Li et al 2014 manuscript ‘Caste-specific RNA editomes in the leaf-cutting ant Acromyrmex echinatior’. The raw sequences can be downloaded from the NCBI GEO repositories using the accession GSE51576 (or visit https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE51576).

Qiye Li and Zongji Wang of BGI conducted the processing of the raw RNA and DNA data, to output a table of SNPs where the relative frequency of any given SNP-allele differed significantly between a paired DNA and RNA sample. This output table is used as the input here. Queries regarding the production of that table should be directed to them.

An interactive table of contents can be used to navigate to the different sections of this document, which is split into two broad parts:

  • Part 1: analysing the output of the bioinformatics
  • Part 2: producing thte figures from the sequence data and the ddPCR data

The input files can be found in the ‘input_data’ directory, and are names ‘ASE_data_frame.csv’, ‘compiled_ddPCR_results.csv’, and ‘dilution_data.csv’. This script outputs two tables, ‘table_s4.csv’ and ‘table_s5.csv’, and a pdf containing the figures for the manuscript ‘figures.pdf’.

Part 0: loading packages

All R packages used are loaded here, these are required to be installed in order for the script to run.

library(ggplot2)    #for graphics
library(splitstackshape)    #for table splitting (not sure this is needed anymore...)
library(reshape)    #for dataframe manipulation
library(tidyr) # for dataframe manipulation
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
library(tidyverse) #for dataframe manipulation
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ purrr   0.3.3     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks reshape::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::rename() masks reshape::rename()
library(ggpubr) #for better control of panels
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(RColorBrewer) #for easier control of colour

Part 1: sequencing data

Importing the Allele-specific expression dataset:

#IMPORT THE DATASET
ASE<- read.csv('../input_data/ASE_data_frame.csv')

Part 1.1: Generating confidence intervals

#Calculating the AgrestiCoul confidence intervals for the proportions
#Make the columns the correct format for calculations
ASE$Gene <- as.character(ASE$Gene)
ASE$Colony <- as.character(ASE$Colony)
ASE$Caste <- as.character(ASE$Caste)
ASE$CountDNA1 <- as.numeric(ASE$CountDNA1)
ASE$CountRNA1 <- as.numeric(ASE$CountRNA1)
ASE$DNAreadsTotal <- as.numeric(ASE$DNAreadsTotal)
ASE$RNAreadsTotal <- as.numeric(ASE$RNAreadsTotal)

##calculate the confidence intervals for the DNA counts, following Agresti-Coull paper cited in manuscript
X <- as.numeric(ASE$CountDNA1)
n <- as.numeric(ASE$DNAreadsTotal)
p <- ((X + 2) / (n + 4))
Z = 1.96 #1.96 -> 95% confidence interval
Top <- p * (1 - p)
Bottom = n + 4
Fraction <- Top / Bottom
ASE$DNAPropMin <- p - (Z * sqrt(Fraction))
ASE$DNAPropMax <- p + (Z * sqrt(Fraction))

#same with the RNA:
##Calculate the observed RNA Confidence Intervals, Agresti-Coull Method.
X <- as.numeric(ASE$CountRNA1)
n <- as.numeric(ASE$RNAreadsTotal)
p <- ((X + 2) / (n + 4))
Z = 1.96
Top <- p * (1 - p)
Bottom = n + 4
Fraction <- Top / Bottom
ASE$RNAPropMin <- p - (Z * sqrt(Fraction))
ASE$RNAPropMax <- p + (Z * sqrt(Fraction))

Part 1.2: Generating imprinting predictions

We proceed through the tests in order of increasing alpha (here we use a to denote alpha):

1 = complete patrigenic expression (a = 0);

2 = biased patrigenic expression (a = 0.2);

3 = biased matrigenic expression (a = 0.8);

4 = complete patrigenic expression (a = 1).

This could likely be shortened to a loop, but has not been done so here. For each value of a, we conduct the combinatorial tests with 3 values of q (0,0.5,1), which denote the queen genotype (Allele proportion of queen).

Part 1.2.1: alpha = 0 ie. exclusive patrigenic expression

a = 0 #set the maternal bias, 0 is complete paternal bias

#make sure the columns are defined as they should be
ASE$DNAPropMin<- as.numeric(as.character(ASE$DNAPropMin)) 
ASE$DNAPropMax<- as.numeric(as.character(ASE$DNAPropMax))
ASE$RNAPropMax<- as.numeric(as.character(ASE$RNAPropMax))
ASE$RNAPropMin<- as.numeric(as.character(ASE$RNAPropMin))

### RUN TESTS
    ##1. Patrigenic expression, queen homozygous for alternative allele.
q = 0 #queen has DNA allele frequency of 0

##Generate predictions from the DNA proportions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower (this is important for the bias section, but makes no difference here)

ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does observed RNA confidence interval  with expected? (and DNA must be lower than 0.5)
ASE$Pat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) &
      (ASE$DNAPropMin <= 0.5)

    ##2. Patrigenic expression, heterozygous
q = 0.5

    ##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval with expected (and does DNA fall between 0.75  and 0.25)
ASE$Pat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)

##3. Patrigenic expression, homozygous 2
q = 1

    ##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does obsRNA confidence interval with expected (and does DNA fall between 0.5 and 1)
ASE$Pat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Part 1.2.2: alpha = 0.2 ie. biased patrigenic expression

This section adds columns for biased patrigenic expression at alpha = 0.2

#BIASED PATRIGENIC EXPRESSION
a = 0.2 #set the maternal bias, 0.2 is 80% from the Patrigene

##1. Patrigenic expression, homozygous for alternative allele.
q = 0 #queen has DNA = 0

##Generate predictions from the DNA proportions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower limit
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper limit

##Does observed RNA confidence interval  with expected? (and DNA must be lower than 0.5)
ASE$Biased_Pat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) &
  (ASE$DNAPropMin <= 0.5)

##2. Patrigenic expression, heterozygous
q = 0.5

##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval with expected (and does DNA fall between 0.75  and 0.25)?
ASE$Biased_Pat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)

##3. Patrigenic expression, homozygous 2
q = 1

##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does obsRNA confidence interval with expected (and does DNA fall between 0.5 and 1)
ASE$Biased_Pat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Note the inflection at x = 0.5.

Part 1.2.3: alpha = 0.8 ie. biased matrigenic expression

#BIASED MATRIGENIC EXPRESSION, alpha = 0.8
a = 0.8 #set the maternal bias, 0.8 is maternal bias 

  ##1 Matrigenic expression, homozygous 1
q = 0
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be less than 0.5
ASE$Biased_Mat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.5)


## 2 Matrigenic expression, heterozygous
q = 0.5
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be between 0.25 and 0.75
ASE$Biased_Mat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)


## 3 Matrigenic expression, homozygous2
q = 1
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Biased_Mat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Part 1.2.4: alpha = 1, ie. complete matrigenic expression

#Complete MATRIGENIC EXPRESSION, alpha = 1
a = 1 #set the maternal bias, 0.8 is maternal bias 

##1. Matrigenic expression, homozygous 1
q = 0
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be less than 0.5
ASE$Mat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.5)


##2. Matrigenic expression, heterozygous
q = 0.5
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Mat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)


##3. Matrigenic expression, homozygous2
q = 1
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Mat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

The table ASE now has logical values representing whether the RNA vs DNA proportions are consistent with patrigenic or matrigenic imprinting, at all three possibilities for the queen genotype, and at complete silencing and biased expression.

Part 1.3: Combinatorial analysis, testing consistency with imprinting predictions

The imprinting analysis is built on the assumptions that all samples will be imprinted in the same direction, and that samples within a colony will share a queen (ie. common queen genotype). The tables output were used to build the tables that report imprinting results in the supplementary info.

ASE$Chr_Locus<- paste(ASE$Chr, ASE$Locus, sep = '_') #make a column to identify the loci across dif. chr, in case there are repeating loci in different chromosomes

# Run combinatorial tests for each of the sets of predictions above
test_results_df<- ASE %>% 
  group_by(Chr_Locus,Gene, Colony) %>% #for each locus, within each colony
  summarise(Complete_Paternal = all(Pat1 == T)|all(Pat2 == T)|all(Pat3 == T),  #did they all follow Patrigene predictions 1 OR all follow 2 OR all follow 3
            Biased_Paternal = all(Biased_Pat1 == T)|all(Biased_Pat2 == T)|all(Biased_Pat3 == T), #same for other alpha values
            Biased_Maternal = all(Biased_Mat1 == T)|all(Biased_Mat2 == T)|all(Biased_Mat3 == T), 
            Complete_Maternal = all(Mat1 == T)|all(Mat2 == T)|all(Mat3 == T)) %>%
  group_by(Chr_Locus, Gene) %>% #then across colonies (only grouping by locus)
  summarise(Complete_Paternal = all(Complete_Paternal == T), Biased_Paternal = all(Biased_Paternal == T),  #were all colonies consistent with paternal
            Biased_Maternal = all(Biased_Maternal == T), Complete_Maternal = all(Complete_Maternal == T)) #were all colonies consistent with maternal

#find those loci where all samples were consistent with different imprinting scenarios, by subsetting from the tables just generated
paternal_complete<- subset(test_results_df, Complete_Paternal == TRUE)
paternal_biased<- subset(test_results_df, Biased_Paternal == TRUE)
maternal_biased<- subset(test_results_df, Biased_Maternal == TRUE)
maternal_complete<- subset(test_results_df, Complete_Maternal == TRUE)

# pull out the complete data for each of the imprinting scenarios
complete_paternal_data<- unique(subset(ASE, Chr_Locus %in% paternal_complete$Chr_Locus))
complete_maternal_data<- unique(subset(ASE, Chr_Locus %in% maternal_complete$Chr_Locus))
biased_paternal_data<- unique(subset(ASE, Chr_Locus %in% paternal_biased$Chr_Locus))
biased_maternal_data<- unique(subset(ASE, Chr_Locus %in% maternal_biased$Chr_Locus))

#simplify so it's easy to read
output_table_paternal_complete<- unique(complete_paternal_data[c('Chr','Locus','Gene')])
output_table_maternal_complete<- unique(complete_maternal_data[c('Chr','Locus','Gene')])
output_table_paternal_biased<- unique(biased_paternal_data[c('Chr','Locus','Gene')])
output_table_maternal_biased<- unique(biased_maternal_data[c('Chr','Locus','Gene')])

#then can be output to tables if required 
#write.table(file = 'paternal_output_complete.csv', output_table_paternal_complete, quote = F, row.names = F, sep = ';')
#write.table(file = 'paternal_output_biased.csv', output_table_paternal_biased, quote = F, row.names = F, sep = ';')
#write.table(file = 'maternal_output_biased.csv', output_table_maternal_biased, quote = F, row.names = F, sep = ';')
#write.table(file = 'maternal_output_complete.csv', output_table_maternal_complete, quote = F, row.names = F, sep = ';')

Part 1.4: Results reported in manuscript

This section generates the results reported in the manuscript text regarding the number of loci under different scenarios

#numbers for manuscript
# how many loci/genes/etc total? and for different imprinting scenarios?
# number of loci overall: 
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Biased_Paternal == TRUE | Biased_Maternal == TRUE | Complete_Maternal == TRUE)$Chr_Locus))
## [1] 92
#number of genes overall:
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Biased_Paternal == TRUE | Biased_Maternal == TRUE | Complete_Maternal == TRUE)$Gene))
## [1] 43
# number of loci overall complete silencing: 
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Complete_Maternal == TRUE)$Chr_Locus))
## [1] 63
#number of genes overall:
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Complete_Maternal == TRUE)$Gene))
## [1] 34
# number of loci for complete paternal:
length(unique(subset(test_results_df, Complete_Paternal == TRUE)$Chr_Locus))
## [1] 47
# number of genes complete paternal:
length(unique(subset(test_results_df, Complete_Paternal == TRUE)$Gene))
## [1] 31
# number of loci for biased paternal:
length(unique(subset(test_results_df, Biased_Paternal == TRUE)$Chr_Locus))
## [1] 59
# number of genes biased paternal:
length(unique(subset(test_results_df, Biased_Paternal == TRUE)$Gene))
## [1] 36
# number of loci complete maternal:
length(unique(subset(test_results_df, Complete_Maternal == TRUE)$Chr_Locus))
## [1] 25
# number of genes complete maternal:
length(unique(subset(test_results_df, Complete_Maternal == TRUE)$Gene))
## [1] 10
# number of loci biased maternal:
length(unique(subset(test_results_df, Biased_Maternal == TRUE)$Chr_Locus))
## [1] 55
# number of genes biased maternal:
length(unique(subset(test_results_df, Biased_Maternal == TRUE)$Gene))
## [1] 24

Part 1.5: Tables, figs and results for supplementary info

Part 1.5.1: supplementary table S4: how many loci per gene/scaffold passed for different values of alpha, and how many loci were there in total?

df_0<- output_table_paternal_complete #simpler names for dataframes df_0 means alpha = 0, df_02 means alpha = 0.2, etc.
df_02<- output_table_paternal_biased
df_08<- output_table_maternal_biased
df_1<- output_table_maternal_complete

#Run for the varying levels of alpha (imprinting bias). 
## alpha = 0
df_0<- df_0 %>% #for alpha = 0 table
  group_by(Gene, Chr) %>% #group by each gene on each chromosome
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_'))) #how many loci were there, and make a gene_Chr variable

ASE_subset_0<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>% #create the same gene_Chr variable for merging
  subset(Gene_Chr %in% df_0$Gene_Chr) %>% #subset to only those genes that passed at alpha = 0
  group_by(Gene_Chr) %>% #for each gene
  summarise(count = length(unique(Locus))) #how many loci in total

df_0<- merge(df_0, ASE_subset_0, by = 'Gene_Chr') #merge the two

df_0$alpha_0 <- df_0$count.x #how many showed imprinting?
df_0$total <- df_0$count.y #how many were there?

# This basic structure is repeated for the different alpha values
## alpha = 0.2
df_02<- df_02 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_02<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_02$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_02 <- merge(df_02, ASE_subset_02, by = 'Gene_Chr')

df_02$alpha_02 <- df_02$count.x
df_02$total <- df_02$count.y

## alpha = 0.8
df_08<- df_08 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_08<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_08$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_08 <- merge(df_08, ASE_subset_08, by = 'Gene_Chr')

df_08$alpha_08 <- df_08$count.x
df_08$total <- df_08$count.y

## alpha = 1
df_1<- df_1 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_1<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_1$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_1 <- merge(df_1, ASE_subset_1, by = 'Gene_Chr')
df_1$alpha_1 <- df_1$count.x
df_1$total <- df_1$count.y

#select the useful columns
df_0 <- df_0[,c(2,3,6,7)]
df_02 <- df_02[,c(2,3,6,7)]
df_08 <- df_08[,c(2,3,6,7)]
df_1 <- df_1[,c(2,3,6,7)]

#create an output table by merging the outputs from the different alpha tables
df_out<- merge(df_0, df_02, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out<- merge(df_out, df_08, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out<- merge(df_out, df_1, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out[is.na(df_out)]<- 0 #replace the nas with 0s
df_out<- df_out[c(1,2,4:7,3)] #take the interesting columns 

#can be saved to table:
write.table(df_out, '../output/table_S4.csv',sep = ';', quote = F, row.names = F)
#head(df_out) #print the merged output

Part 1.5.2: supplementary table S5: sample consistency

# How many samples were consistent with imprinting across all the loci with ASE? This is the same table as test_results_df, but with number of samples
test_results_df_numbers<- ASE %>% 
  group_by(Chr_Locus,Colony) %>% # for each locus and colony 
  summarise(Complete_Paternal = max(sum(Pat1 == T), sum(Pat2 == T), sum(Pat3 == T)), #take the maximum total of samples that passed the tests, 3 if colony is consistent with imprinting
            Biased_Paternal = max(sum(Biased_Pat1 == T), sum(Biased_Pat2 == T), sum(Biased_Pat3 == T)), #and for other values of alpha
            Biased_Maternal = max(sum(Biased_Mat1 == T), sum(Biased_Mat2 == T), sum(Biased_Mat3 == T)), 
            Complete_Maternal = max(sum(Mat1 == T), sum(Mat2 == T), sum(Mat3 == T))
            ) %>%
  group_by(Chr_Locus) %>% #then for each locus
  summarise(Complete_Paternal = sum(Complete_Paternal), Biased_Paternal = sum(Biased_Paternal), #sum up each of the values to get total for matrigenic and patrigenic (9 if locus consistent)
            Biased_Maternal = sum(Biased_Maternal), Complete_Maternal = sum(Complete_Maternal))

## how many genes with only 1 locus
loci<- ASE %>%
  group_by(Gene) %>%
  summarise(loci = length(unique(Locus))) #summarise by number of loci
one_locus<- subset(loci, loci == 1) #take only those with  1
length(one_locus$Gene) #how many are there
## [1] 43
### figuring out how many samples were consistent/inconsistent for each locus
any_imprinted<- subset(test_results_df_numbers, Complete_Paternal == 9 | Biased_Paternal == 9 | Biased_Maternal == 9 | Complete_Maternal == 9  ) #loci where all 9 were consistent with one of the imprinting scenarios

# find out all loci for the gene
ASE<- ASE %>%
  mutate(Chr_Gene = paste(Chr, Gene, sep = '_')) #make a scaffold by gene column (for those cases where the same gene is split over scaffolds)
  
gene_list<- ASE %>%
  subset(Chr_Locus %in% any_imprinted$Chr_Locus) #make a list of all the genes where 1 locus imprinted

any_locus_within_gene_imprinted<- ASE %>%
  subset(Chr_Gene %in% gene_list$Chr_Gene) #get a subset of ASE for all genes where ≥1 locus imprinted (this has data for all the samples)

#how many samples were inconsistent with imprinting for each alpha?
inconsistent_samples<- test_results_df_numbers %>%
  subset(Chr_Locus %in% any_locus_within_gene_imprinted$Chr_Locus) %>%
  mutate(Complete_Paternal = 9 - Complete_Paternal, Biased_Paternal = 9 - Biased_Paternal, Biased_Maternal = 9 - Biased_Maternal, Complete_Maternal = 9 - Complete_Maternal)

ASE_simple<- ASE[c('Chr_Locus','Gene','Chr','Locus')] #make a readable ASE table
df<- unique(merge(ASE_simple, inconsistent_samples, by = 'Chr_Locus')[-1]) #merge so we have the gene names

#make summary table about inconsistent samples, how many genes where x samples were inconsistent?
summary_inconsistent_samples<- 
  df %>%
  group_by(Gene, Chr) %>%
  summarise(alpha_0 = sum(Complete_Paternal), alpha_02 = sum(Biased_Paternal), alpha_08 = sum(Biased_Maternal),alpha_1 = sum(Complete_Maternal)) %>%
  pivot_longer(cols = starts_with('alpha'), values_to = "Samples") %>%
  group_by(Samples) %>%
  summarise(alpha_0 = sum(name == 'alpha_0'),alpha_02 = sum(name == 'alpha_02'),alpha_08 = sum(name == 'alpha_08'),alpha_1 = sum(name == 'alpha_1'))

write.csv(df, '../output/table_S5.csv', quote =F, row.names = F)

Part 1.5.3 supplementary fig S1

for_plot<- summary_inconsistent_samples %>%
  pivot_longer(cols = starts_with('alpha'), values_to = 'count') %>%
  mutate(name = parse_number(name)) %>%
  mutate(name = case_when(name == 8 ~ 0.8,
                          name == 2 ~ 0.2,
                          name == 0 ~ 0,
                          name == 1 ~ 1
                          )) %>%
  subset(Samples != 0)
for_plot<- unique(for_plot)

p <- ggplot(for_plot, aes(x = Samples, y = count, group = name, colour = as.factor(name)))
p <- p + scale_x_log10()
p <- p + geom_point(size = 4, alpha = 0.6, position = position_jitter(height = 0, width = 0.01))
p <- p + theme_minimal()
p <- p + scale_colour_viridis_d(name = 'Alpha')
p <- p + xlab("Samples inconsistent within gene") + ylab("Count")
p <- p + labs(title = 'Figure S1') 

#pdf('Fig. S1.pdf', useDingbats = F)
p