这里赋予功能到另一列的新列是我处理的数据的样本数据帧。对于那些熟悉基因数据格式的人来说,它基本上是一个修改后的VCF文件。如果没有,基本上每行都包含变体可能存在的基因组中位置的信息。通过在数据帧
samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"),
Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L,
8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L,
8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L,
3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G",
"T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L,
1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C",
"G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997",
"AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149",
"rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889",
"rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom",
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")
我想要做的是从“信息”列中提取值。但是,此列中包含的信息对于每行都不相同,并不总是以相同的顺序出现。因此,我想使用模式匹配来获取我感兴趣的值。
我写了一个小函数来提取包含在Info列中的各种“超级种群”(例如AMR,AFR,EUR,SAS,EAS)的“等位基因频率”(AF)。
extractAF <- function(pop, vec) {
info <- unlist((strsplit(vec, ";", fixed=TRUE)))
AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
return(AF)
}
此功能需要两个参数:“流行”,这是一个字符串,指定超级人口以提取,并且其目的是把我的数据帧的信息栏的“VEC”。
extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779
extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528
不过,我希望它做的数据帧中的每一行,并创建包含数据的新列:通过一个单一的载体,通过当
功能按预期工作。当我使用dplyr的功能发生变异,我风与相同值的列:
library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))
我看了一个帖子(我似乎现在不能找到,否则我会引用它),上述发生变异一次传递所有行,而不是我需要的逐行。
所以,我想下面的几个其他的方式在此基础上post:
apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))
错误申请(SAMP [ “信息”],1,函数(X)extractAF( “AMR_AF”,X) ): 暗淡(X)必须有一个正长度
samp[, extractAF("AMR_AF", Info), by = .I]
错误[.data.frame
(SAMP,extractAF( “AMR_AF”,信息),通过= .I): 未使用的参数(由= .I)
samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]
Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) :
unused argument (by = 1:nrow(samp))
#
UPDATE
一个包含NA和AF = 0在下面的INFO列附加样本数据集:
结构(列表(CHROM = C( “CHR1”,“CHR1 (“rs6429774”,“rs6429776”,“chr1”,“chr1”,“chr1”, “chr1”),POS = c(16090898L,16091074L,16091583L,16092212L, 16093560L,16093639L) NA, “rs74528955”,“rs904912”,NA),REF = c(“G”,“A”,“T”,“C”,“T”,“C”), ALT = c(“A “,”G“,”A“,”T“,” A“,”T“),QUAL = c(NA,NA,NA,NA,NA, NA),FILTER = c(NA,NA,NA,NA,NA,NA),INFO = = 1606; AF = 0.320687; AN = 5008; NS = 2504; DP = 21565; EAS_AF = 0.1419; AMR_AF = 0.2983; AFR_AF = 0.525; EUR_AF = 0.3509; SAS_AF = 0.2137; AA = G |||; CSQ = A | ENSG00000162458 | ENST00000441801 | Transcript | upstream_gene_variant ||||||| 96 | 1 ||||||; ERB = A || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335“,”AC = 1690; AF = 0。33746; AN = 5008; NS = 2504; DP = 20247; EAS_AF = 0.1498; AMR_AF = 0.3012; AFR_AF = 0.5681; EUR_AF = 0.3549; SAS_AF = 0.227; AA = G |||; CSQ = G | ENSG00000162458 | ENST00000441801 |解说词| 5_prime_UTR_variant | 81 | |||||| 1 | ||||||; ERB = G || proximal_1216 | Regulatory_Feature | proximal_enhancer; FUNSEQ = 0.3335“,NA, ”AC = 8; AF = 0.00159744; AN = 5008 ; NS = 2504; DP = 19197; EAS_AF = 0.0079; AMR_AF = 0; AFR_AF = 0; EUR_AF = 0; SAS_AF = 0; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441801 |文稿| ||| intron_variant ||||| 1 = |||||| GENCODE = ENST00000441801; ERB = T || proximal_1216 | Regulator_Feature | proximal_enhancer; FUNSEQ = 0.3335“,”AC = 3282; AF = 0.655351; AN = 5008; NS = 2504; DP = 14721; EAS_AF = 0.8343; AMR_AF = 0.6916; AFR_AF = 0.4259; EUR_AF = 0.6531; SAS_AF = 0.7577; AA = A |||; CSQ = A | ENSG00000162458 | ENST00000441801 |文稿| intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801; FUNSEQ = 0.1483“, ”AC = 5; AF = 0.000998403; AN = 5008; NS = 2504; DP = 14736; EAS_AF = 0.003; AMR_AF = 0; AFR_AF = 0 ; EUR_AF = 0; SAS_AF = 0.002; AA = C |||; CSQ = T | ENSG00000162458 | ENST00000441 801 | Transcript | intron_variant |||||||| 1 ||||||; GENCODE = ENST00000441801; FUNSEQ = 0.1483“ )),row.names = 14:19,class =”data.frame“,。名= C( “CHROM”, “POS”, “ID”, “REF”, “ALT”, “QUAL”, “FILTER”, “INFO”))
你的应用功能接近'申请(SAMP,1,函数(X)extractAF(“AFR_AF “,x [5]))' –
请正确格式化您的代码 –