Reading DNA methylation data
Julia Romanowska
2024-01-17
Source:vignettes/Read_data.Rmd
Read_data.Rmd
Introduction
This vignette gives an overview of the HaplinMethyl
functions for reading DNA methylation data into memory.
Read data from a file
Let’s use the example data file that ships with the package:
library(HaplinMethyl)
#> Loading required package: Haplin
ex_path <- system.file("extdata", package = "HaplinMethyl")
ex_file <- "env_data_test.dat"
ex_out_file <- "dnam_ex"
To read directly from a file, use the envDataRead
function:
dnam_ex <- envDataRead(
file.in = ex_file,
dir.in = ex_path,
file.out = ex_out_file,
sep = " ", # the exemplary file is a space-delimited file
header = TRUE, # make sure to check this!
rownames = TRUE, # make sure to check this!
overwrite = TRUE
)
#> The output file(s) exist!
#> Reading the data in chunks...
#> -- chunk 1--
#> -- chunk 2--
#> ... done reading.
#> Preparing data...
#> ... done preparing
#> Saving data...
#> ... saved to file: ./dnam_ex_env.ffData
Now your working directory contains new files:
dir(pattern = ex_out_file)
#> [1] "dnam_ex_3cpg_cat_gen.ffData" "dnam_ex_3cpg_cat_gen.RData"
#> [3] "dnam_ex_3cpgs_env.ffData" "dnam_ex_3cpgs_env.RData"
#> [5] "dnam_ex_env.ffData" "dnam_ex_env.RData"
If you want to use this data in future analyses, don’t delete those files!
Read data from a matrix
It is also possible to create this object from a matrix that is already in memory (below, dnam_matrix stands for your own matrix).
dnam_ex_from_matrix <- envDataReadFromObj(dnam_matrix)
Loading data
Reading in our example dataset did not take long because the dataset is small. However, with real-world data, this might take up to an hour. Thus, you don’t want to do it every time you start a new analysis!
If you close your current R session or delete the DNA methylation
object, you can easily reload it by using the envDataLoad
function and the new files created by envDataRead.
dnam_ex <- envDataLoad(ex_out_file)
NOTE: loading the data from the .ffData file takes seconds instead of tens of minutes! :)
All of the functions mentioned above return an object of class
env.data
and, additionally, either env.cont
for continuous
measurements or env.cat
for categorical data.
class(dnam_ex)
#> [1] "env.cont" "env.data"
Look at data
DNA methylation data is usually stored as a huge matrix with CpGs in columns and samples in rows, or vice versa. Each cell of the matrix stores a \(\beta\) value, which tells us the level of methylation of the CpG in the sample (a number between 0 and 1). It might be difficult to print the entire dataset, so we’ve created some helper functions.
dnam_ex
#> This is continuous environmental data read in by 'envDataRead'
#> with 400 columns
#> and 200 rows.
summary(dnam_ex)
#> List of 5
#> $ class : chr [1:2] "env.cont" "env.data"
#> $ nrow : int 200
#> $ ncol : int 400
#> $ rownames: chr [1:200] "id1" "id2" "id3" "id4" ...
#> $ colnames: chr [1:400] "cg1" "cg2" "cg3" "cg4" ...
If you want the complete vector of row names and/or column names, use the
short = FALSE
argument of the summary
function:
summary(dnam_ex, short = FALSE)
#> $class
#> [1] "env.cont" "env.data"
#>
#> $nrow
#> [1] 200
#>
#> $ncol
#> [1] 400
#>
#> $rownames
#> [1] "id1" "id2" "id3" "id4" "id5" "id6" "id7" "id8" "id9"
#> [10] "id10" "id11" "id12" "id13" "id14" "id15" "id16" "id17" "id18"
#> [19] "id19" "id20" "id21" "id22" "id23" "id24" "id25" "id26" "id27"
#> [28] "id28" "id29" "id30" "id31" "id32" "id33" "id34" "id35" "id36"
#> [37] "id37" "id38" "id39" "id40" "id41" "id42" "id43" "id44" "id45"
#> [46] "id46" "id47" "id48" "id49" "id50" "id51" "id52" "id53" "id54"
#> [55] "id55" "id56" "id57" "id58" "id59" "id60" "id61" "id62" "id63"
#> [64] "id64" "id65" "id66" "id67" "id68" "id69" "id70" "id71" "id72"
#> [73] "id73" "id74" "id75" "id76" "id77" "id78" "id79" "id80" "id81"
#> [82] "id82" "id83" "id84" "id85" "id86" "id87" "id88" "id89" "id90"
#> [91] "id91" "id92" "id93" "id94" "id95" "id96" "id97" "id98" "id99"
#> [100] "id100" "id101" "id102" "id103" "id104" "id105" "id106" "id107" "id108"
#> [109] "id109" "id110" "id111" "id112" "id113" "id114" "id115" "id116" "id117"
#> [118] "id118" "id119" "id120" "id121" "id122" "id123" "id124" "id125" "id126"
#> [127] "id127" "id128" "id129" "id130" "id131" "id132" "id133" "id134" "id135"
#> [136] "id136" "id137" "id138" "id139" "id140" "id141" "id142" "id143" "id144"
#> [145] "id145" "id146" "id147" "id148" "id149" "id150" "id151" "id152" "id153"
#> [154] "id154" "id155" "id156" "id157" "id158" "id159" "id160" "id161" "id162"
#> [163] "id163" "id164" "id165" "id166" "id167" "id168" "id169" "id170" "id171"
#> [172] "id172" "id173" "id174" "id175" "id176" "id177" "id178" "id179" "id180"
#> [181] "id181" "id182" "id183" "id184" "id185" "id186" "id187" "id188" "id189"
#> [190] "id190" "id191" "id192" "id193" "id194" "id195" "id196" "id197" "id198"
#> [199] "id199" "id200"
#>
#> $colnames
#> [1] "cg1" "cg2" "cg3" "cg4" "cg5" "cg6" "cg7" "cg8" "cg9"
#> [10] "cg10" "cg11" "cg12" "cg13" "cg14" "cg15" "cg16" "cg17" "cg18"
#> [19] "cg19" "cg20" "cg21" "cg22" "cg23" "cg24" "cg25" "cg26" "cg27"
#> [28] "cg28" "cg29" "cg30" "cg31" "cg32" "cg33" "cg34" "cg35" "cg36"
#> [37] "cg37" "cg38" "cg39" "cg40" "cg41" "cg42" "cg43" "cg44" "cg45"
#> [46] "cg46" "cg47" "cg48" "cg49" "cg50" "cg51" "cg52" "cg53" "cg54"
#> [55] "cg55" "cg56" "cg57" "cg58" "cg59" "cg60" "cg61" "cg62" "cg63"
#> [64] "cg64" "cg65" "cg66" "cg67" "cg68" "cg69" "cg70" "cg71" "cg72"
#> [73] "cg73" "cg74" "cg75" "cg76" "cg77" "cg78" "cg79" "cg80" "cg81"
#> [82] "cg82" "cg83" "cg84" "cg85" "cg86" "cg87" "cg88" "cg89" "cg90"
#> [91] "cg91" "cg92" "cg93" "cg94" "cg95" "cg96" "cg97" "cg98" "cg99"
#> [100] "cg100" "cg101" "cg102" "cg103" "cg104" "cg105" "cg106" "cg107" "cg108"
#> [109] "cg109" "cg110" "cg111" "cg112" "cg113" "cg114" "cg115" "cg116" "cg117"
#> [118] "cg118" "cg119" "cg120" "cg121" "cg122" "cg123" "cg124" "cg125" "cg126"
#> [127] "cg127" "cg128" "cg129" "cg130" "cg131" "cg132" "cg133" "cg134" "cg135"
#> [136] "cg136" "cg137" "cg138" "cg139" "cg140" "cg141" "cg142" "cg143" "cg144"
#> [145] "cg145" "cg146" "cg147" "cg148" "cg149" "cg150" "cg151" "cg152" "cg153"
#> [154] "cg154" "cg155" "cg156" "cg157" "cg158" "cg159" "cg160" "cg161" "cg162"
#> [163] "cg163" "cg164" "cg165" "cg166" "cg167" "cg168" "cg169" "cg170" "cg171"
#> [172] "cg172" "cg173" "cg174" "cg175" "cg176" "cg177" "cg178" "cg179" "cg180"
#> [181] "cg181" "cg182" "cg183" "cg184" "cg185" "cg186" "cg187" "cg188" "cg189"
#> [190] "cg190" "cg191" "cg192" "cg193" "cg194" "cg195" "cg196" "cg197" "cg198"
#> [199] "cg199" "cg200" "cg201" "cg202" "cg203" "cg204" "cg205" "cg206" "cg207"
#> [208] "cg208" "cg209" "cg210" "cg211" "cg212" "cg213" "cg214" "cg215" "cg216"
#> [217] "cg217" "cg218" "cg219" "cg220" "cg221" "cg222" "cg223" "cg224" "cg225"
#> [226] "cg226" "cg227" "cg228" "cg229" "cg230" "cg231" "cg232" "cg233" "cg234"
#> [235] "cg235" "cg236" "cg237" "cg238" "cg239" "cg240" "cg241" "cg242" "cg243"
#> [244] "cg244" "cg245" "cg246" "cg247" "cg248" "cg249" "cg250" "cg251" "cg252"
#> [253] "cg253" "cg254" "cg255" "cg256" "cg257" "cg258" "cg259" "cg260" "cg261"
#> [262] "cg262" "cg263" "cg264" "cg265" "cg266" "cg267" "cg268" "cg269" "cg270"
#> [271] "cg271" "cg272" "cg273" "cg274" "cg275" "cg276" "cg277" "cg278" "cg279"
#> [280] "cg280" "cg281" "cg282" "cg283" "cg284" "cg285" "cg286" "cg287" "cg288"
#> [289] "cg289" "cg290" "cg291" "cg292" "cg293" "cg294" "cg295" "cg296" "cg297"
#> [298] "cg298" "cg299" "cg300" "cg301" "cg302" "cg303" "cg304" "cg305" "cg306"
#> [307] "cg307" "cg308" "cg309" "cg310" "cg311" "cg312" "cg313" "cg314" "cg315"
#> [316] "cg316" "cg317" "cg318" "cg319" "cg320" "cg321" "cg322" "cg323" "cg324"
#> [325] "cg325" "cg326" "cg327" "cg328" "cg329" "cg330" "cg331" "cg332" "cg333"
#> [334] "cg334" "cg335" "cg336" "cg337" "cg338" "cg339" "cg340" "cg341" "cg342"
#> [343] "cg343" "cg344" "cg345" "cg346" "cg347" "cg348" "cg349" "cg350" "cg351"
#> [352] "cg352" "cg353" "cg354" "cg355" "cg356" "cg357" "cg358" "cg359" "cg360"
#> [361] "cg361" "cg362" "cg363" "cg364" "cg365" "cg366" "cg367" "cg368" "cg369"
#> [370] "cg370" "cg371" "cg372" "cg373" "cg374" "cg375" "cg376" "cg377" "cg378"
#> [379] "cg379" "cg380" "cg381" "cg382" "cg383" "cg384" "cg385" "cg386" "cg387"
#> [388] "cg388" "cg389" "cg390" "cg391" "cg392" "cg393" "cg394" "cg395" "cg396"
#> [397] "cg397" "cg398" "cg399" "cg400"
You can also quickly get the number of rows and columns: