Introduction
This vignette will show you how to run the interaction analyses using
Haplin
package.
Reading data
First, we need to read two datasets: DNA methylation and genotype. We’ll use simulated data here.
library(HaplinMethyl)
#> Loading required package: Haplin
ex_path <- system.file("extdata", package = "HaplinMethyl")
ex_file <- "env_data_test.dat"
ex_out_file <- "dnam_ex"
dnam_ex <- envDataRead(
file.in = ex_file,
dir.in = ex_path,
file.out = ex_out_file,
sep = " ",
overwrite = TRUE
)
#> The output file(s) exist!
#> Reading the data in chunks...
#> -- chunk 1--
#> -- chunk 2--
#> ... done reading.
#> Preparing data...
#> ... done preparing
#> Saving data...
#> ... saved to file: ./dnam_ex_env.ffData
ex_gen_file <- "sim1"
gen_ex <- genDataLoad(
filename = ex_gen_file,
dir.in = ex_path
)
Let’s take a look at it
dnam_ex
#> This is continuous environmental data read in by 'envDataRead'
#> with 400 columns
#> and 200 rows.
showRaw(dnam_ex)
#> opening ff /tmp/RtmpHoavNS/ff/ff52f9d3308edaf.ff
#> cg1 cg2 cg3 cg4 cg5
#> id1 0.2045598 0.38376996 0.76001042 0.8731273 0.7625753
#> id2 0.2593365 0.41689855 0.94649631 0.9673046 0.4619886
#> id3 0.9912362 0.91398561 0.07621296 0.3841574 0.7026818
#> id4 0.4740415 0.04124038 0.56624919 0.4589275 0.4461102
#> id5 0.7989879 0.52987689 0.66167605 0.1056240 0.3917910
#> attr(,"Csingle")
#> [1] TRUE
gen_ex
#> This is raw genetic data read in through genDataRead.
#>
#> It contains the following parts:
#> cov.data, gen.data, aux
#>
#> with following dimensions:
#> - covariate variables = cov.1
#> (total 1 covariate variables),
#> - number of markers = 1 ,
#> - number of data lines = 200
showGen(gen_ex)
#> Warning in showGen(gen_ex): It appears that your data has less markers (1) than
#> requested (5), adjusting accordingly.
#> opening ff /tmp/RtmpHoavNS/ff/tmp/RtmpekQIls/ff/ff164011e6d830.ff
#> ff (open) byte length=30 (30) dim=c(5,6) dimorder=c(1,2) levels: 1 2
#> l_m1_a_m l_m1_b_m l_m1_a_f l_m1_b_f l_m1_a_c l_m1_b_c
#> [1,] 1 1 2 1 1 1
#> [2,] 1 1 2 2 2 1
#> [3,] 1 2 1 1 1 2
#> [4,] 1 1 2 2 2 1
#> [5,] 1 2 2 2 2 1
The genotype in the example is coded as 1 and 2 instead of a specific base letter (e.g., A and T, or A and C) and there is only one marker. The order of the columns is as follows: maternal allele 1 (A1), maternal A2, paternal A1, paternal A2, child’s A1, and child’s A2.
Creating the strata
Let’s assume that we know that the first three methylation sites are near the marker that is in the genotype dataset. Thus, we need to extract those methylation sites first.
dnam_ex_subset <- envDataSubset(
env.data = dnam_ex,
col.ids = 1:3,
file.out = "dnam_subset"
)
#> Will select 3 columns.
#> Saving data...
#> ... saved to files: ./dnam_subset_env.ffData, ./dnam_subset_env.RData
dnam_ex_subset
#> This is continuous environmental data read in by 'envDataRead'
#> with 3 columns
#> and 200 rows.
Next, let’s create one variable that gives division of the samples into strata based on the summarized methylation level over the three chosen methylation sites.
dnam_ex_cat <- envDataCategorize(
env.data = dnam_ex_subset,
breaks = 3,
file.out = "dnam_cat"
)
#> opening ff /tmp/RtmpHoavNS/ff/ff52f9d356db393.ff
#> Creating categories: 1,2,3
#> Saving data...
#> ... saved to file: ./dnam_cat_gen.ffData
dnam_ex_cat
#> This is categorical environmental data read in by 'envDataRead'
#> with 1 columns
#> and 200 rows.
showRaw(dnam_ex_cat)
#> opening ff /tmp/RtmpHoavNS/ff/ff52f9d6918ae3d.ff
#> [,1]
#> id1 2
#> id2 2
#> id3 3
#> id4 1
#> id5 3
#> attr(,"vmode")
#> [1] byte
#> Levels: 1 2 3
Adding the strata variable
Now, the strata variable needs to be added to the genotype dataset.
NB: be careful to check that the order of the samples is the same in the stratified DNA methylation data as in the genotype data!
new_strata <- showRaw(dnam_ex_cat, rows = 1:nrows(dnam_ex_cat))
gen_ex_strat <- addCovar(
data.in = gen_ex,
covar = new_strata,
c.name = "dnam_c"
)
gen_ex_strat
#> This is raw genetic data read in through genDataRead.
#>
#> It contains the following parts:
#> cov.data, gen.data, aux
#>
#> with following dimensions:
#> - covariate variables = cov.1, dnam_c
#> (total 2 covariate variables),
#> - number of markers = 1 ,
#> - number of data lines = 200
showPheno(gen_ex_strat)
#> cov.1 dnam_c
#> id1 "1" "2"
#> id2 "1" "2"
#> id3 "1" "3"
#> id4 "1" "1"
#> id5 "1" "3"
Preparing genotype data
Haplin requires pre-processing of the genotype data before any analysis can be made.
gen_ex_strat_prep <- genDataPreprocess(
data.in = gen_ex_strat,
file.out = "gen_strat_prep"
)
#> Reading the marker names...
#> Warning: No map file given, map file empty or the number of map file rows not
#> equal to the number of markers in data; will generate dummy marker names.
#> ...done
#> Recoding covariate data...
#> ...done
#> Recoding genetic data (no. of loci: 1)...
#> ...running on only one CPU core! This may take some time...
#> ...checking alleles per SNP...
#> ...done, all alleles: 1 2
#> ...recoding SNPs...
#> ...done
#> Saving data...
#> ... saved to files: ./gen_strat_prep_gen.ffData , ./gen_strat_prep_gen.RData
gen_ex_strat_prep
#>
#> This is preprocessed data, ready for haplin analysis.
#>
#> It contains the following parts:
#> cov.data, gen.data, aux
#>
#> with following dimensions:
#> - number of covariate variables = 2 , - number of markers = 1 , - number of individuals/families = 200
Run the analysis
Gene-methylation interactions
Finally, we can run the analysis by using
haplinStrat
g_x_me_results <- haplinStrat(
data = gen_ex_strat_prep,
strata = 2
)
#>
#> ## Running haplinStrat ##
#>
#> Selected stratification variable: dnam_c.c
#> Frequency distribution of stratification variable:
#> 1 2 3
#> 67 66 67
#>
#> Running Haplin on full data file...
#> opening ff /tmp/RtmpHoavNS/ff/ff52f9d6e480ef5.ff
#> Done
#>
#> Running Haplin on stratum "1"...Done
#>
#> Running Haplin on stratum "2"...Done
#>
#> Running Haplin on stratum "3"...Done
The resulting object is a list of results, with estimates per stratum and for pooled sample.
names(g_x_me_results)
#> [1] "all" "1" "2" "3"
g_x_me_results$all
#> This is the result of a haplin run.
#> Number of data lines used: 200 | Number of haplotypes used: 2
#> Please use the "summary", "plot", "haptable" or "output" functions to obtain
#> more details.
haptable(g_x_me_results)
#> stratum row.str marker alleles counts HWE.pv Original After.rem.NA
#> 1 all 1 m1 1/2 607/593 0.3708224 200 200
#> 2 all 2 <NA> <NA> <NA> NA 200 200
#> 3 1 1 m1 1/2 178/224 0.4912064 67 67
#> 4 1 2 <NA> <NA> <NA> NA 67 67
#> 5 2 1 m1 1/2 206/190 0.6861466 66 66
#> 6 2 2 <NA> <NA> <NA> NA 66 66
#> 7 3 1 m1 1/2 223/179 0.1415450 67 67
#> 8 3 2 <NA> <NA> <NA> NA 67 67
#> After.rem.Mend.inc. After.rem.unused.haplos pv.overall haplos haplofreq
#> 1 200 200 0.3249067 1 0.5177906
#> 2 200 200 0.3249067 2 0.4822094
#> 3 67 67 0.4425462 1 0.4919781
#> 4 67 67 0.4425462 2 0.5080219
#> 5 66 66 0.4570745 1 0.5309276
#> 6 66 66 0.4570745 2 0.4690724
#> 7 67 67 0.3942638 1 0.5306729
#> 8 67 67 0.3942638 2 0.4693271
#> haplofreq.lower haplofreq.upper reference RR.est. RR.lower RR.upper
#> 1 0.4687235 0.5660839 ref 1.0000000 1.0000000 1.000000
#> 2 0.4339161 0.5312765 - 0.8789336 0.5957902 1.300486
#> 3 0.4080047 0.5767633 ref 1.0000000 1.0000000 1.000000
#> 4 0.4232367 0.5919953 - 1.4932158 0.6929969 3.201399
#> 5 0.4456386 0.6133251 ref 1.0000000 1.0000000 1.000000
#> 6 0.3866749 0.5543614 - 0.7933176 0.4063251 1.553061
#> 7 0.4458306 0.6119415 ref 1.0000000 1.0000000 1.000000
#> 8 0.3880585 0.5541694 - 0.6609049 0.3473787 1.256858
#> RR.p.value RRdd.est. RRdd.lower RRdd.upper RRdd.p.value
#> 1 NA 1.0000000 1.0000000 1.000000 NA
#> 2 0.5176644 1.1551660 0.6705660 1.980248 0.6004552
#> 3 NA 1.0000000 1.0000000 1.000000 NA
#> 4 0.2970941 1.8696232 0.6871265 5.107933 0.2111747
#> 5 NA 1.0000000 1.0000000 1.000000 NA
#> 6 0.4981511 1.1573301 0.4504226 2.912177 0.7623507
#> 7 NA 1.0000000 1.0000000 1.000000 NA
#> 8 0.2071984 0.8026436 0.3118921 2.019629 0.6384801
plot(g_x_me_results)
To check the significance of the interaction, we need to use
gxe
function
gxe(g_x_me_results)
#> gxe.test chisq df pval
#> 1 haplo.freq 0.5026134 2 0.7777838
#> 2 child 2.9488102 4 0.5664281
#> 3 haplo.freq.trend 0.3731476 1 0.5412935
#> 4 child.trend 2.6559459 2 0.2650139
The p-values here show that the interaction was not significant.
Parent-of-origin-methylation interactions
Similarly, we can also check for interaction between the methylation level and parent-of-origin effect of the genetic marker.
poo_x_me_results <- haplinStrat(
data = gen_ex_strat_prep,
strata = 2,
poo = TRUE
)
#> Warning: Can only (for the time being) use reference = "ref.cat" or
#> "population" when poo == TRUE. Has been changed to "ref.cat".
#>
#> ## Running haplinStrat ##
#>
#> Selected stratification variable: dnam_c.c
#> Frequency distribution of stratification variable:
#> 1 2 3
#> 67 66 67
#>
#> Running Haplin on full data file...Done
#>
#> Running Haplin on stratum "1"...Done
#>
#> Running Haplin on stratum "2"...Done
#>
#> Running Haplin on stratum "3"...Done
haptable(poo_x_me_results)
#> stratum row.str marker alleles counts HWE.pv Original After.rem.NA
#> 1 all 1 m1 1/2 607/593 0.3708224 200 200
#> 2 all 2 <NA> <NA> <NA> NA 200 200
#> 3 1 1 m1 1/2 178/224 0.4912064 67 67
#> 4 1 2 <NA> <NA> <NA> NA 67 67
#> 5 2 1 m1 1/2 206/190 0.6861466 66 66
#> 6 2 2 <NA> <NA> <NA> NA 66 66
#> 7 3 1 m1 1/2 223/179 0.1415450 67 67
#> 8 3 2 <NA> <NA> <NA> NA 67 67
#> After.rem.Mend.inc. After.rem.unused.haplos pv.overall haplos haplofreq
#> 1 200 200 0.4215675 1 0.5176705
#> 2 200 200 0.4215675 2 0.4823295
#> 3 67 67 0.6427490 1 0.4930795
#> 4 67 67 0.6427490 2 0.5069205
#> 5 66 66 0.6552084 1 0.5308702
#> 6 66 66 0.6552084 2 0.4691298
#> 7 67 67 0.4587525 1 0.5302072
#> 8 67 67 0.4587525 2 0.4697928
#> haplofreq.lower haplofreq.upper reference RRcm.est. RRcm.lower RRcm.upper
#> 1 0.4700483 0.5670651 ref 1.0000000 1.0000000 1.000000
#> 2 0.4329349 0.5299517 - 0.9551130 0.6125346 1.512641
#> 3 0.4078670 0.5757597 ref 1.0000000 1.0000000 1.000000
#> 4 0.4242403 0.5921330 - 1.5606351 0.6593474 3.661163
#> 5 0.4479463 0.6150836 ref 1.0000000 1.0000000 1.000000
#> 6 0.3849164 0.5520537 - 0.8267190 0.3738303 1.870074
#> 7 0.4480859 0.6138420 ref 1.0000000 1.0000000 1.000000
#> 8 0.3861580 0.5519141 - 0.7716568 0.3761164 1.625104
#> RRcm.p.value RRcf.est. RRcf.lower RRcf.upper RRcf.p.value RRcm_RRcf.est.
#> 1 NA 1.0000000 1.0000000 1.000000 NA 1.000000
#> 2 0.8578440 0.7964699 0.4988981 1.294462 0.3508241 1.202628
#> 3 NA 1.0000000 1.0000000 1.000000 NA 1.000000
#> 4 0.3100549 1.4407954 0.6019219 3.440700 0.4240687 1.080438
#> 5 NA 1.0000000 1.0000000 1.000000 NA 1.000000
#> 6 0.6510574 0.7508834 0.3337436 1.712289 0.5004946 1.100715
#> 7 NA 1.0000000 1.0000000 1.000000 NA 1.000000
#> 8 0.4954824 0.5381458 0.2415788 1.232018 0.1409641 1.432857
#> RRcm_RRcf.lower RRcm_RRcf.upper RRcm_RRcf.p.value RRdd.est. RRdd.lower
#> 1 1.0000000 1.000000 NA 1.0000000 1.0000000
#> 2 0.7367299 1.980414 0.4609985 1.1497242 0.6795373
#> 3 1.0000000 1.000000 NA 1.0000000 1.0000000
#> 4 0.4823310 2.468996 0.8399792 1.8820947 0.6829190
#> 5 1.0000000 1.000000 NA 1.0000000 1.0000000
#> 6 0.4520536 2.759066 0.8271603 1.1445511 0.4635546
#> 7 1.0000000 1.000000 NA 1.0000000 1.0000000
#> 8 0.6120656 3.383351 0.4057060 0.7890665 0.3166753
#> RRdd.upper RRdd.p.value
#> 1 1.000000 NA
#> 2 2.012892 0.6007082
#> 3 1.000000 NA
#> 4 4.944720 0.2225700
#> 5 1.000000 NA
#> 6 2.997610 0.7579938
#> 7 1.000000 NA
#> 8 2.051910 0.6357477
plot(poo_x_me_results)
gxe(poo_x_me_results)
#> gxe.test chisq df pval
#> 1 haplo.freq 0.5026134 2 0.7777838
#> 2 poo 0.2626489 2 0.8769332
#> 3 haplo.freq.trend 0.3731476 1 0.5412935
#> 4 poo.trend 0.2175134 1 0.6409412