Skip to contents

Introduction

This vignette gives an overview of the HaplinMethyl functions for reading DNA methylation data into memory.

Read data from a file

Let’s use the example data file shipped with the package:

library(HaplinMethyl)
#> Loading required package: Haplin

ex_path <- system.file("extdata", package = "HaplinMethyl")
ex_file <- "env_data_test.dat"
ex_out_file <- "dnam_ex"

To read directly from a file, use the envDataRead function:

dnam_ex <- envDataRead(
  file.in = ex_file,
  dir.in = ex_path,
  file.out = ex_out_file,
  sep = " ", # the exemplary file is a space-delimited file
  header = TRUE, # make sure to check this!
  rownames = TRUE, # make sure to check this!
  overwrite = TRUE
)
#> The output file(s) exist!
#> Reading the data in chunks...
#>  -- chunk 1--
#>  -- chunk 2--
#> ... done reading.
#> Preparing data...
#> ... done preparing
#> Saving data...
#> ... saved to file: ./dnam_ex_env.ffData

Now, in your working directory, you have new files:

dir(pattern = ex_out_file)
#> [1] "dnam_ex_3cpg_cat_gen.ffData" "dnam_ex_3cpg_cat_gen.RData" 
#> [3] "dnam_ex_3cpgs_env.ffData"    "dnam_ex_3cpgs_env.RData"    
#> [5] "dnam_ex_env.ffData"          "dnam_ex_env.RData"

If you want to use this data in future analyses, don’t delete those files!

Read data from a matrix

It is also possible to create this object from a matrix that is already in memory (in the call below, dnam_matrix stands for such a matrix).

dnam_ex_from_matrix <- envDataReadFromObj(dnam_matrix)

Loading data

Reading in our example dataset did not take long because the dataset is small. However, with real-world data, this might take up to an hour. Thus, you don’t want to do it every time you start a new analysis!

If you close your current R session or delete the DNA methylation object, you can easily reload it by using the envDataLoad function and the files created by envDataRead.

dnam_ex <- envDataLoad(ex_out_file)

NOTE: loading the data from the .ffData file takes seconds instead of tens of minutes! :)

All of the functions mentioned above return an object of class env.data that also has the class env.cont (for continuous measurements) or env.cat (for categorical data).

class(dnam_ex)
#> [1] "env.cont" "env.data"

Look at data

DNA methylation data is usually stored as a huge matrix with CpGs in columns and samples in rows, or vice versa. Each cell of the matrix stores a \(\beta\) value, a number between 0 and 1 that tells us the rate of methylation of that CpG in that sample. Printing the entire dataset is rarely practical, so we’ve created some helper functions.

dnam_ex
#> This is continuous environmental data read in by 'envDataRead'
#> with 400 columns
#> and 200 rows.
summary(dnam_ex)
#> List of 5
#>  $ class   : chr [1:2] "env.cont" "env.data"
#>  $ nrow    : int 200
#>  $ ncol    : int 400
#>  $ rownames: chr [1:200] "id1" "id2" "id3" "id4" ...
#>  $ colnames: chr [1:400] "cg1" "cg2" "cg3" "cg4" ...

If you want the complete vector of row names and/or column names, use the short = FALSE argument of the summary function:

summary(dnam_ex, short = FALSE)
#> $class
#> [1] "env.cont" "env.data"
#> 
#> $nrow
#> [1] 200
#> 
#> $ncol
#> [1] 400
#> 
#> $rownames
#>   [1] "id1"   "id2"   "id3"   "id4"   "id5"   "id6"   "id7"   "id8"   "id9"  
#>  [10] "id10"  "id11"  "id12"  "id13"  "id14"  "id15"  "id16"  "id17"  "id18" 
#>  [19] "id19"  "id20"  "id21"  "id22"  "id23"  "id24"  "id25"  "id26"  "id27" 
#>  [28] "id28"  "id29"  "id30"  "id31"  "id32"  "id33"  "id34"  "id35"  "id36" 
#>  [37] "id37"  "id38"  "id39"  "id40"  "id41"  "id42"  "id43"  "id44"  "id45" 
#>  [46] "id46"  "id47"  "id48"  "id49"  "id50"  "id51"  "id52"  "id53"  "id54" 
#>  [55] "id55"  "id56"  "id57"  "id58"  "id59"  "id60"  "id61"  "id62"  "id63" 
#>  [64] "id64"  "id65"  "id66"  "id67"  "id68"  "id69"  "id70"  "id71"  "id72" 
#>  [73] "id73"  "id74"  "id75"  "id76"  "id77"  "id78"  "id79"  "id80"  "id81" 
#>  [82] "id82"  "id83"  "id84"  "id85"  "id86"  "id87"  "id88"  "id89"  "id90" 
#>  [91] "id91"  "id92"  "id93"  "id94"  "id95"  "id96"  "id97"  "id98"  "id99" 
#> [100] "id100" "id101" "id102" "id103" "id104" "id105" "id106" "id107" "id108"
#> [109] "id109" "id110" "id111" "id112" "id113" "id114" "id115" "id116" "id117"
#> [118] "id118" "id119" "id120" "id121" "id122" "id123" "id124" "id125" "id126"
#> [127] "id127" "id128" "id129" "id130" "id131" "id132" "id133" "id134" "id135"
#> [136] "id136" "id137" "id138" "id139" "id140" "id141" "id142" "id143" "id144"
#> [145] "id145" "id146" "id147" "id148" "id149" "id150" "id151" "id152" "id153"
#> [154] "id154" "id155" "id156" "id157" "id158" "id159" "id160" "id161" "id162"
#> [163] "id163" "id164" "id165" "id166" "id167" "id168" "id169" "id170" "id171"
#> [172] "id172" "id173" "id174" "id175" "id176" "id177" "id178" "id179" "id180"
#> [181] "id181" "id182" "id183" "id184" "id185" "id186" "id187" "id188" "id189"
#> [190] "id190" "id191" "id192" "id193" "id194" "id195" "id196" "id197" "id198"
#> [199] "id199" "id200"
#> 
#> $colnames
#>   [1] "cg1"   "cg2"   "cg3"   "cg4"   "cg5"   "cg6"   "cg7"   "cg8"   "cg9"  
#>  [10] "cg10"  "cg11"  "cg12"  "cg13"  "cg14"  "cg15"  "cg16"  "cg17"  "cg18" 
#>  [19] "cg19"  "cg20"  "cg21"  "cg22"  "cg23"  "cg24"  "cg25"  "cg26"  "cg27" 
#>  [28] "cg28"  "cg29"  "cg30"  "cg31"  "cg32"  "cg33"  "cg34"  "cg35"  "cg36" 
#>  [37] "cg37"  "cg38"  "cg39"  "cg40"  "cg41"  "cg42"  "cg43"  "cg44"  "cg45" 
#>  [46] "cg46"  "cg47"  "cg48"  "cg49"  "cg50"  "cg51"  "cg52"  "cg53"  "cg54" 
#>  [55] "cg55"  "cg56"  "cg57"  "cg58"  "cg59"  "cg60"  "cg61"  "cg62"  "cg63" 
#>  [64] "cg64"  "cg65"  "cg66"  "cg67"  "cg68"  "cg69"  "cg70"  "cg71"  "cg72" 
#>  [73] "cg73"  "cg74"  "cg75"  "cg76"  "cg77"  "cg78"  "cg79"  "cg80"  "cg81" 
#>  [82] "cg82"  "cg83"  "cg84"  "cg85"  "cg86"  "cg87"  "cg88"  "cg89"  "cg90" 
#>  [91] "cg91"  "cg92"  "cg93"  "cg94"  "cg95"  "cg96"  "cg97"  "cg98"  "cg99" 
#> [100] "cg100" "cg101" "cg102" "cg103" "cg104" "cg105" "cg106" "cg107" "cg108"
#> [109] "cg109" "cg110" "cg111" "cg112" "cg113" "cg114" "cg115" "cg116" "cg117"
#> [118] "cg118" "cg119" "cg120" "cg121" "cg122" "cg123" "cg124" "cg125" "cg126"
#> [127] "cg127" "cg128" "cg129" "cg130" "cg131" "cg132" "cg133" "cg134" "cg135"
#> [136] "cg136" "cg137" "cg138" "cg139" "cg140" "cg141" "cg142" "cg143" "cg144"
#> [145] "cg145" "cg146" "cg147" "cg148" "cg149" "cg150" "cg151" "cg152" "cg153"
#> [154] "cg154" "cg155" "cg156" "cg157" "cg158" "cg159" "cg160" "cg161" "cg162"
#> [163] "cg163" "cg164" "cg165" "cg166" "cg167" "cg168" "cg169" "cg170" "cg171"
#> [172] "cg172" "cg173" "cg174" "cg175" "cg176" "cg177" "cg178" "cg179" "cg180"
#> [181] "cg181" "cg182" "cg183" "cg184" "cg185" "cg186" "cg187" "cg188" "cg189"
#> [190] "cg190" "cg191" "cg192" "cg193" "cg194" "cg195" "cg196" "cg197" "cg198"
#> [199] "cg199" "cg200" "cg201" "cg202" "cg203" "cg204" "cg205" "cg206" "cg207"
#> [208] "cg208" "cg209" "cg210" "cg211" "cg212" "cg213" "cg214" "cg215" "cg216"
#> [217] "cg217" "cg218" "cg219" "cg220" "cg221" "cg222" "cg223" "cg224" "cg225"
#> [226] "cg226" "cg227" "cg228" "cg229" "cg230" "cg231" "cg232" "cg233" "cg234"
#> [235] "cg235" "cg236" "cg237" "cg238" "cg239" "cg240" "cg241" "cg242" "cg243"
#> [244] "cg244" "cg245" "cg246" "cg247" "cg248" "cg249" "cg250" "cg251" "cg252"
#> [253] "cg253" "cg254" "cg255" "cg256" "cg257" "cg258" "cg259" "cg260" "cg261"
#> [262] "cg262" "cg263" "cg264" "cg265" "cg266" "cg267" "cg268" "cg269" "cg270"
#> [271] "cg271" "cg272" "cg273" "cg274" "cg275" "cg276" "cg277" "cg278" "cg279"
#> [280] "cg280" "cg281" "cg282" "cg283" "cg284" "cg285" "cg286" "cg287" "cg288"
#> [289] "cg289" "cg290" "cg291" "cg292" "cg293" "cg294" "cg295" "cg296" "cg297"
#> [298] "cg298" "cg299" "cg300" "cg301" "cg302" "cg303" "cg304" "cg305" "cg306"
#> [307] "cg307" "cg308" "cg309" "cg310" "cg311" "cg312" "cg313" "cg314" "cg315"
#> [316] "cg316" "cg317" "cg318" "cg319" "cg320" "cg321" "cg322" "cg323" "cg324"
#> [325] "cg325" "cg326" "cg327" "cg328" "cg329" "cg330" "cg331" "cg332" "cg333"
#> [334] "cg334" "cg335" "cg336" "cg337" "cg338" "cg339" "cg340" "cg341" "cg342"
#> [343] "cg343" "cg344" "cg345" "cg346" "cg347" "cg348" "cg349" "cg350" "cg351"
#> [352] "cg352" "cg353" "cg354" "cg355" "cg356" "cg357" "cg358" "cg359" "cg360"
#> [361] "cg361" "cg362" "cg363" "cg364" "cg365" "cg366" "cg367" "cg368" "cg369"
#> [370] "cg370" "cg371" "cg372" "cg373" "cg374" "cg375" "cg376" "cg377" "cg378"
#> [379] "cg379" "cg380" "cg381" "cg382" "cg383" "cg384" "cg385" "cg386" "cg387"
#> [388] "cg388" "cg389" "cg390" "cg391" "cg392" "cg393" "cg394" "cg395" "cg396"
#> [397] "cg397" "cg398" "cg399" "cg400"

You can also quickly get the number of rows and columns:

nrows(dnam_ex)
#> [1] 200
ncolumns(dnam_ex)
#> [1] 400