1) 获取 Referencia 非常简单:
#' Take the last `n` characters of each string.
#'
#' @param x Character vector; each element is treated as one string.
#' @param n Number of trailing characters to keep. When `n` exceeds the
#'   length of a string, the whole string is returned.
#' @return A character vector the same length as `x`. As with the previous
#'   `sapply()`-based version, the result is named by the input strings when
#'   `x` is an unnamed character vector.
substrRight <- function(x, n) {
  # vapply() instead of sapply(): the result is always a character vector,
  # even for empty input (sapply() would return an empty list there).
  vapply(
    x,
    function(xx) {
      len <- nchar(xx)
      substr(xx, len - n + 1, len)
    },
    character(1)
  )
}
library(XML)
# Search-results page of boe.es for the requested date range.
u <- "http://www.boe.es/buscar/boe.php?campo%5B1%5D=DOC&dato%5B1%5D=edicto+auto+declaracion+concurso+CIF%20&campo%5B6%5D=FPU&dato%5B6%5D%5B0%5D=25%2F04%2F2013&dato%5B6%5D%5B1%5D=30%2F04%2F2013%20&sort_field%5B0%5D=fpu&sort_order%5B0%5D=desc&sort_field%5B1%5D=ref&sort_order%5B1%5D=asc&accion=Buscar"
doc1 <- htmlParse(u) # download and parse the HTML
kbbRoot <- xmlRoot(doc1) # root node of the parsed document
# All nodes carrying the CSS class "resultado-busqueda-link-defecto".
els <- getNodeSet(kbbRoot, "//*[contains(concat(' ', @class, ' '), concat(' ', 'resultado-busqueda-link-defecto', ' '))]")
# Relative hrefs of the result links; nodes without an href are dropped
# instead of producing a ragged list.
hrefs <- vapply(
  els,
  function(el) xmlGetAttr(el, "href", NA_character_),
  character(1)
)
hrefs <- hrefs[!is.na(hrefs)]
# Drop the first two characters (the leading "..") and keep the "/" that
# follows, so the remainder can be appended directly to the host name.
# The length guard matters: paste0() with a zero-length argument would
# otherwise yield the bare host as a bogus link.
links <- if (length(hrefs) > 0) {
  paste0("http://www.boe.es", substring(hrefs, 3))
} else {
  character(0)
}
# The reference is taken as the last 16 characters of each link.
# as.character() strips the names added by substrRight().
Referencia <- as.character(substrRight(links, 16))
2) 提取 CIF 要复杂得多,必须使用正则表达式。可惜我并不擅长正则表达式,所以建议在论坛上另行提问:“应该用什么正则表达式从字符串中提取 CIF 值?”
#' Try to pull a CIF-like token out of an announcement page.
#'
#' Parses the page at `u`, collects the text of every node whose class
#' attribute contains "parrafo", and searches each text for an upper-case
#' letter followed by digits.
#'
#' @param u URL (or anything `htmlParse()` accepts) of the page to scan.
#' @return A single string cut from the paragraph that matched, or
#'   `NA_character_` when the page has no such paragraphs or none of them
#'   contains a match.
CIFRA <- function(u) {
  doc1 <- htmlParse(u) # download and parse the HTML
  kbbRoot <- xmlRoot(doc1)
  # Every node whose class attribute contains the substring "parrafo".
  els <- getNodeSet(kbbRoot, "//*[contains(concat('', @class,''), concat('', 'parrafo', ''))]")
  # One text per node, flattened to a plain character vector.
  l <- unlist(lapply(els, xmlValue), use.names = FALSE)
  if (length(l) == 0L) {
    return(NA_character_)
  }

  # regexpr() gives, per paragraph, the position of the first match
  # (-1 when there is none) plus a "match.length" attribute.
  x <- regexpr(pattern = "[A-Z][0-9]+", text = l)
  # Without this guard which.max() would pick row 1 even though nothing
  # matched, and an empty string would be returned.
  if (!any(!is.na(x) & x > 0)) {
    return(NA_character_)
  }

  # Row whose match starts furthest into its paragraph.
  ind <- which.max(x)
  # NOTE(review): the start is shifted 3 characters to the left of the
  # match while the length is kept; nothing in this file explains the
  # offset -- confirm against real page text before relying on it.
  st <- x[ind] - 3
  en <- st + attr(x, "match.length")[ind] - 1
  substring(l[ind], st, en)
}
# Apply CIFRA() to every announcement link collected above.
CIF <- sapply(links, function(x) CIFRA(x))
看看 R 中的 XML 库 – 2013-04-25 20:22:59
另外,能否补充更多信息,比如你希望在 R 中得到的示例表格? – 2013-04-25 20:24:05
@绿色恶魔 感谢你推荐的包。示例表格已粘贴在上文,也可以在上面的链接中找到,就是标有“Datos generales del concurso”的那个框。 – nopeva 2013-04-26 06:11:21