2015-12-30 50 views
2

我需要根据第4列(alleles,等位基因)的变化,交换另外四列(第6-7列和第10-11列)的顺序。第4列需要做如下更改:

A/C should be C/A 
G/C should be C/G 
A/T should be T/A 
G/A should be A/G 
G/T should be T/G 
T/C should be C/T 

当第4列更改时,必须交换该行上第6:7列(Major_Allele_Frequency 和 Minor_Allele_Frequency)以及第10:11列(X.HomA 和 X.HomB)的顺序。示例:

library(data.table) 
# Example input, one row per site. `alleles` (column 4) holds either a single
# base or an "X/Y" pair; Major/Minor_Allele_Frequency are columns 6:7 and
# X.HomA/X.HomB are columns 10:11.
data <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 A/C 23 0.58696 0.41304 9 39.13 35.73 25.14
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 G/C 23 0.97727 0.02273 1 4.35 93.48 2.17
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 T/C 23 0.88636 0.11364 5 21.74 69.37 8.89
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 A/T 23 0.52174 0.47826 12 52.17 24.95 22.87
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 G/A 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 G/T 23 0.475 0.525 17 73.91 12.39 13.70
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
data <- read.table(text = data, header = TRUE)


Expected outcome: 
# Expected result, in the same column layout as the example input: flipped
# allele codes with the paired columns (6:7 and 10:11) exchanged on those rows.
expected <- "chr start tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB
chr1 1 chr1-1 A 23 1 0 0 0.00 100.00 0.00
chr1 2 chr1-2 C/A 23 0.41304 0.58696 9 39.13 25.14 35.73
chr1 3 chr1-3 C/A 23 0.88636 0.11364 3 13.04 77.07 9.88
chr1 4 chr1-4 C/G 23 1 0 0 0.00 100.00 0.00
chr1 5 chr1-5 C/T 23 0.52174 0.47826 18 78.26 11.34 10.40
chr1 6 chr1-6 G 23 1 0 0 0.00 100.00 0.00
chr1 7 chr1-7 C/G 23 0.02273 0.97727 1 4.35 2.17 93.48
chr1 8 chr1-8 T 23 1 0 0 0.00 100.00 0.00
chr1 9 chr1-9 C/T 23 0.11364 0.88636 5 21.74 8.89 69.37
chr1 10 chr1-10 A/G 23 0.5 0.5 6 26.09 36.96 36.96
chr1 11 chr1-11 T/A 23 0.47826 0.52174 12 52.17 22.87 24.95
chr1 12 chr1-12 T/A 23 0.80435 0.19565 9 39.13 48.96 11.91
chr1 13 chr1-13 A/G 23 1 0 0 0.00 100.00 0.00
chr1 14 chr1-14 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 1 chr2-1 T/G 23 0.525 0.475 17 73.91 13.70 12.39
chr2 2 chr2-2 C 23 100 0 0 0 100 0"
# Spell out TRUE: `T` is an ordinary variable and can be reassigned.
expected <- read.table(text = expected, header = TRUE)
+0

我不明白为什么列的顺序很重要。你的意思是想把第5列和第6列中的_数据_互换(_swap_)吗? –

回答

3

由于等位基因的字符编码顺序显然是反的,可以用 ?strsplit 帮助页中给出的 strReverse 函数把 alleles 列中的值拆开、颠倒顺序再粘贴回去,同时交换上面提到的各列:

# String reverse function, adapted from the examples in '?strsplit'.
#
# x: a character vector.
# Returns a character vector of the same length as `x` in which the
# characters of every element appear in reverse order.
strReverse <- function(x) {
  # vapply() always yields a character vector, so zero-length input gives
  # character(0) instead of the empty list that sapply() would return.
  vapply(
    lapply(strsplit(x, NULL), rev),
    paste,
    character(1),
    collapse = ""
  )
}

library(data.table) 
# On every row whose allele code is one of the listed pairs: reverse the code
# and exchange Major/Minor_Allele_Frequency and X.HomA/X.HomB. The columns are
# updated by reference on `dat`.
setDT(dat)[
  alleles %in% c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  c(
    "alleles",
    "Major_Allele_Frequency", "Minor_Allele_Frequency",
    "X.HomA", "X.HomB"
  ) := list(
    strReverse(as.character(alleles)),
    Minor_Allele_Frequency, Major_Allele_Frequency,
    X.HomB, X.HomA
  )
]

或者,也可以使用 stringi 包中的 stri_reverse 函数:

library(stringi) 
library(data.table) 
# Same update as the strReverse variant, but the allele code is reversed with
# stringi::stri_reverse(). Rows whose code is one of the listed pairs get the
# code reversed and Major/Minor_Allele_Frequency and X.HomA/X.HomB exchanged.
setDT(dat)[
  alleles %in% c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  c(
    "alleles",
    "Major_Allele_Frequency", "Minor_Allele_Frequency",
    "X.HomA", "X.HomB"
  ) := list(
    stri_reverse(alleles),
    Minor_Allele_Frequency, Major_Allele_Frequency,
    X.HomB, X.HomA
  )
]

两种方法都给出:

> dat 
    chr start  tag alleles Number_of_Taxa Major_Allele_Frequency Minor_Allele_Frequency Number_Heterozygous X.Heterozygous X.HomA X.HomB 
1: chr1  1 chr1-1  A    23    1.00000    0.00000     0   0.00 100.00 0.00 
2: chr1  2 chr1-2  C/A    23    0.41304    0.58696     9   39.13 25.14 35.73 
3: chr1  3 chr1-3  C/A    23    0.88636    0.11364     3   13.04 77.07 9.88 
4: chr1  4 chr1-4  C/G    23    1.00000    0.00000     0   0.00 100.00 0.00 
5: chr1  5 chr1-5  C/T    23    0.52174    0.47826     18   78.26 11.34 10.40 
6: chr1  6 chr1-6  G    23    1.00000    0.00000     0   0.00 100.00 0.00 
7: chr1  7 chr1-7  C/G    23    0.02273    0.97727     1   4.35 2.17 93.48 
8: chr1  8 chr1-8  T    23    1.00000    0.00000     0   0.00 100.00 0.00 
9: chr1  9 chr1-9  C/T    23    0.11364    0.88636     5   21.74 8.89 69.37 
10: chr1 10 chr1-10  A/G    23    0.50000    0.50000     6   26.09 36.96 36.96 
11: chr1 11 chr1-11  T/A    23    0.47826    0.52174     12   52.17 22.87 24.95 
12: chr1 12 chr1-12  T/A    23    0.80435    0.19565     9   39.13 48.96 11.91 
13: chr1 13 chr1-13  A/G    23    0.00000    1.00000     0   0.00 0.00 100.00 
14: chr1 14 chr1-14  T/G    23    0.52500    0.47500     17   73.91 13.70 12.39 
15: chr2  1 chr2-1  T/G    23    0.52500    0.47500     17   73.91 13.70 12.39 
16: chr2  2 chr2-2  C    23    100.00000    0.00000     0   0.00 100.00 0.00 

PS:最好不要把数据集命名为 data,所以我用 dat 作为名字

+0

我只是忘了提及:T/C 也应该是 C/T。只需在 Jaap 建议的命令中添加 "T/C" 即可:setDT(dat)[alleles %in% c("A/C","G/C","A/T","G/A","G/T","T/C"), – Fpertille

+0

@Fileille thanx,更新了答案 – Jaap

0

使用基础 R:这里假设等位基因的任何变化都是一次交换,因此不再做进一步检查:

# Base-R approach: every row whose allele code differs from the target code is
# treated as a swap; no further validation is performed.
swapped <- data
alleles <- as.character(expected$alleles) # or other vector, since I think expected won't exist yet
changes <- which(as.character(data$alleles) != alleles)
swapped[changes, c(6, 7, 10, 11)] <- data[changes, c(7, 6, 11, 10)] # this is the swap
# Index `alleles` by `changes` as well: assigning the full-length vector into
# the subset would recycle from its first elements and write the wrong codes.
swapped$alleles[changes] <- alleles[changes]
1

今天再来一个老式的答案。我把变量名改成了 data1

# Flip allele codes that are written in the unwanted order and, on the same
# rows, exchange Major/Minor_Allele_Frequency and X.HomA/X.HomB.
# Names are the codes to replace, values are their replacements. "T/C" -> "C/T"
# is part of the requested mapping and is handled here too.
flip_map <- c(
  "A/C" = "C/A",
  "G/C" = "C/G",
  "A/T" = "T/A",
  "G/A" = "A/G",
  "G/T" = "T/G",
  "T/C" = "C/T"
)

# Compare as character so the lookup works for factor and character columns
# alike; %in% is FALSE for NA, so rows with a missing code are left untouched.
allele_chr <- as.character(data1$alleles)
flip_rows <- which(allele_chr %in% names(flip_map))
allele_chr[flip_rows] <- unname(flip_map[allele_chr[flip_rows]])

if (is.factor(data1$alleles)) {
  # Keep the column a factor and make sure every replacement code is a level,
  # otherwise the new codes would silently become NA.
  data1$alleles <- factor(
    allele_chr,
    levels = union(levels(data1$alleles), unname(flip_map))
  )
} else {
  data1$alleles <- allele_chr
}

# Exchange the paired columns on the flipped rows only.
major_old <- data1$Major_Allele_Frequency[flip_rows]
data1$Major_Allele_Frequency[flip_rows] <- data1$Minor_Allele_Frequency[flip_rows]
data1$Minor_Allele_Frequency[flip_rows] <- major_old

hom_a_old <- data1$X.HomA[flip_rows]
data1$X.HomA[flip_rows] <- data1$X.HomB[flip_rows]
data1$X.HomB[flip_rows] <- hom_a_old
2

这是很简单的:

data <- as.data.table(data)
# Lookup table: `original` is the allele code to replace, `change` its
# replacement. "T/C" -> "C/T" is part of the requested mapping.
tab <- data.table(
  original = c("A/C", "G/C", "A/T", "G/A", "G/T", "T/C"),
  change = c("C/A", "C/G", "T/A", "A/G", "T/G", "C/T")
)

# For each mapping, rewrite the allele code on the matching rows and exchange
# Major/Minor_Allele_Frequency and X.HomA/X.HomB on those rows.
# seq_len() makes the loop a no-op for an empty lookup table.
for (i in seq_len(nrow(tab))) {
  data[
    alleles == tab[i, original],
    c(
      "alleles",
      "Major_Allele_Frequency", "Minor_Allele_Frequency",
      "X.HomA", "X.HomB"
    ) := list(
      tab[i, change],
      Minor_Allele_Frequency, Major_Allele_Frequency,
      X.HomB, X.HomA
    )
  ]
}
+0

我只是忘了提及:T/C 也应该是 C/T。只需在 @danas.zuokas 建议的那一行中分别添加 'T/C' 和 'C/T',如下所示:tab <- data.table(original = c('A/C','G/C','A/T','G/A','G/T','T/C'), change = c('C/A','C/G','T/A','A/G','T/G','C/T'));在 Jaap 建议的命令中则像这样:setDT(dat)[alleles %in% c("A/C","G/C","A/T","G/A","G/T","T/C"), – Fpertille