2015-06-22 68 views
-1

我想创建一个bot来列出我的wiki的页面,然后搜索wiki的所有页面,以便在另一个页面中找到页面的名称时创建超链接。Api mediawiki:一个创建超链接的机器人?

例如,我有一个名为“Wiki”的页面,而另一个页面的正文中出现了“wiki”这个词。我希望把这个词变成指向“Wiki”页面的超链接。

这是我第一次使用这个API,所以我不知道该如何着手。我已经发现可以用“list=allpages”列出所有页面,并用“list=search”在整个wiki中搜索某个字符串;但是,当我得到包含该字符串的页面名称之后,我怎样才能只编辑页面中的那些字符串呢?

目前我正在用PHP来实现,所以我是否可以先获取页面的全部内容,修改它,然后再把修改后的内容保存回页面?

+0

我想这是可以做到的。不过我不认为你需要这样的机器人——你能使用MediaWiki自带的搜索工具吗?对于每个新建或被编辑的页面,用脚本搜索各个页面标题,并为尚未加链接的结果加上超链接。话虽如此,让用户手动添加链接不是更容易吗? – halfer

+0

我也认为手动这样做比较容易,但只能在小维基上进行:/当已经有很多页面时,自动更容易执行 – Ise

+0

好的。所以,先算出你的算法。您需要监视页面更改:如果标题发生更改(或因为添加了页面而添加),则您需要创建或中断链接。如果页面更改(使用不同的单词),那么您将需要重新扫描链接。如果你删除一个页面,你会想要删除链接。我会为此考虑一个简单的cron,并在每次调用时检查最后编辑的页面。如果这与上次不同,请运行页面扫描仪。先在纸上画出来? – halfer

回答

0

如果你有兴趣,下面是我为创建这个机器人所写的代码:

// Bot settings: replace the placeholder values before running the script.
$path_cookie = "______path________"; // path of the file cURL uses for cookies
$botLogin    = "Bot";
$botPass     = "password";
$linkWiki    = "exemple.com";        // passed to cURL as CURLOPT_URL for every request

// Make sure the cookie file exists so the session can be kept between requests.
if (!file_exists($path_cookie)) {
    touch($path_cookie);
}

// A single cURL handle is shared by all the requests below.
$curl = curl_init();

/**
 * Sends one POST request on the shared cURL handle and returns the raw response.
 *
 * @param array  $postfields  POST fields of the request (e.g. action, format, ...).
 * @param mixed  $curl        cURL handle created with curl_init().
 * @param string $linkWiki    URL the request is sent to.
 * @param string $path_cookie Path of the cookie file; it is resolved with realpath()
 *                            and handed to cURL as CURLOPT_COOKIEFILE.
 *
 * @return string|false Response body, or false when curl_exec() fails.
 */
function requeteCurl($postfields, $curl, $linkWiki, $path_cookie)
{
    // Options are applied in the order listed here.
    curl_setopt_array($curl, array(
        CURLOPT_URL            => $linkWiki,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => $postfields,
        CURLOPT_COOKIEFILE     => realpath($path_cookie),
    ));

    return curl_exec($curl);
}

    /* First you need to login with your bot */ 


// Step 1: send the credentials. Wikis that use the two-step login answer with
// result "NeedToken" and a token that must be sent back in a second request.
$postfields = array(
    'action'     => 'login',
    'format'     => 'json',
    'lgname'     => $botLogin,
    'lgpassword' => $botPass
);

curl_setopt($curl, CURLOPT_URL, $linkWiki);
curl_setopt($curl, CURLOPT_COOKIESESSION, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_POST, true);
curl_setopt($curl, CURLOPT_POSTFIELDS, $postfields);
curl_setopt($curl, CURLOPT_COOKIEJAR, $path_cookie); // cookies have to be stored from the first request on
$connexion = curl_exec($curl);
if (!$connexion) {
    throw new Exception("Error getting data from server ($linkWiki): " . curl_error($curl));
}

$json_connexion = json_decode($connexion, true);
if (!is_array($json_connexion) || !isset($json_connexion['login']['result'])) {
    // Not JSON, or not a login answer (wrong URL, HTML error page, API error, ...).
    throw new Exception("Unexpected login response from $linkWiki: " . $connexion);
}

// Step 2: only repeat the login with the token when the wiki asked for it.
if ($json_connexion['login']['result'] === 'NeedToken' && isset($json_connexion['login']['token'])) {
    $tokenConnexion = $json_connexion['login']['token'];
    $postfields = array(
        'action'     => 'login',
        'format'     => 'json',
        'lgtoken'    => $tokenConnexion,
        'lgname'     => $botLogin,
        'lgpassword' => $botPass
    );
    $connexionToken = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
} else {
    // The first answer is already the final one.
    $connexionToken = $connexion;
}
var_dump($connexionToken);

// Stop here when the login did not succeed: continuing would run the bot
// without the bot account's session.
$json_connexionToken = json_decode($connexionToken, true);
if (!isset($json_connexionToken['login']['result']) || $json_connexionToken['login']['result'] !== 'Success') {
    throw new Exception("Login failed on $linkWiki for user $botLogin");
}

    /* You have to list all the pages in your wiki to know which strings to search for */ 

// Collect the title of every page of the wiki into $tabNomsPagesWiki.
$postfields = array(
    'action'  => 'query',
    'format'  => 'json',
    'list'    => 'allpages',
    'aplimit' => 'max'
);

// Always defined, even when the wiki returns no page or the request fails,
// so the search loop that follows can iterate over it safely.
$tabNomsPagesWiki = array();

// One request returns at most 'aplimit' titles. When more are available the
// answer carries a "continue" element whose values must be sent back with the
// next request. Answers without that element end the loop after one request.
$continuation = array();
do {
    $pagesWiki = requeteCurl(array_merge($postfields, $continuation), $curl, $linkWiki, $path_cookie);
    $json_pagesWikis = json_decode($pagesWiki, true);

    if (isset($json_pagesWikis["query"]["allpages"]) && is_array($json_pagesWikis["query"]["allpages"])) {
        foreach ($json_pagesWikis["query"]["allpages"] as $pages) {
            $tabNomsPagesWiki[] = $pages["title"]; // unmodified page titles
        }
    }

    $continuation = (isset($json_pagesWikis["continue"]) && is_array($json_pagesWikis["continue"]))
        ? $json_pagesWikis["continue"]
        : array();
} while (!empty($continuation));

    /* Then you search on all the wiki to find the pages where the string you search is */ 

/**
 * Turns the free-standing occurrences of a page title inside a wikitext into links.
 *
 * An occurrence is a case-insensitive match of the title preceded by whitespace
 * or an apostrophe and followed by whitespace, a comma, a dot or an apostrophe.
 * Occurrences located inside an existing [[...]] or [...] link are left alone.
 * Each remaining occurrence is replaced by "[[<title>]]", so the linked text
 * takes the casing of the title.
 *
 * @param string $texte Wikitext of the page to modify.
 * @param string $titre Title of the page to link to.
 *
 * @return array Two elements: the resulting wikitext and the number of links created.
 */
function ajouterLiens($texte, $titre)
{
    // The title is plain text, not a pattern: escape regex metacharacters and
    // the "/" delimiter (titles such as "C++" or "A/B" would otherwise break
    // the expressions below or match unrelated text).
    $titreRegex = preg_quote($titre, '/');

    $stringLien = "[[" . $titre . "]]"; // the string which replaces the one found in the text
    $stringLength = strlen($titre);

    // Existing links that contain the title.
    $patternLien = "/((\\[\\[[^\\]]*)[\\s](" . $titreRegex . ")[\\s\\,\\.][^\\]]*\\]\\])|((\\[[^\\]]*)[\\s\\'](" . $titreRegex . ")[\\s\\,\\.\\'][^\\]]*\\])/mi";
    // The title on its own, including its leading and trailing delimiter.
    $patternNomPage = "/[\\s\\']" . $titreRegex . "[\\s\\,\\.\\']/im";

    $nbOccurrences = preg_match_all($patternNomPage, $texte, $nomPages, PREG_OFFSET_CAPTURE);
    $nbLiensExistants = preg_match_all($patternLien, $texte, $liens, PREG_OFFSET_CAPTURE);
    if (!$nbOccurrences || $nbLiensExistants === false) {
        // Nothing to link, or the regex engine reported an error.
        return array($texte, 0);
    }

    // Each match starts on the one-character delimiter in front of the title,
    // hence the initial shift of 1. Offsets come from the unmodified text, so
    // the shift grows by 4 ("[[" + "]]") after every replacement.
    $decalage = 1;
    $nbLiens = 0;

    foreach ($nomPages[0] as $page) {
        $offsetNomPagetrouvee = $page[1];

        // Compare against the span of every existing link.
        $est_dans_lien = false;
        foreach ($liens[0] as $lien) {
            $lienOffset = $lien[1];
            $lienTaille = strlen($lien[0]);
            if ($lienOffset <= $offsetNomPagetrouvee && $offsetNomPagetrouvee <= $lienOffset + $lienTaille) {
                $est_dans_lien = true;
                break;
            }
        }

        if (!$est_dans_lien) {
            $texte = substr_replace($texte, $stringLien, $offsetNomPagetrouvee + $decalage, $stringLength);
            $decalage += 4;
            $nbLiens++;
        }
    }

    return array($texte, $nbLiens);
}

// Every page title is used as a search string: each page whose text mentions
// the title is fetched, the mentions are linked, and the page is saved.
foreach ($tabNomsPagesWiki as $chaineRecherchee) {
    $postfields = array(
        'action'   => 'query',
        'format'   => 'json',
        'list'     => 'search',
        'srsearch' => $chaineRecherchee,
        'srwhat'   => 'text',
        'srlimit'  => 'max'
    );

    $pagesString = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
    $json_pagesString = json_decode($pagesString, true);
    if (!isset($json_pagesString["query"]["search"]) || !is_array($json_pagesString["query"]["search"])) {
        continue; // request failed or the API returned an error for this search
    }

    // Titles of the pages containing the searched string.
    $pagesComportantLaRecherche = array();
    foreach ($json_pagesString["query"]["search"] as $search) {
        $pagesComportantLaRecherche[] = $search["title"];
    }

    foreach ($pagesComportantLaRecherche as $pageRecherche) {
        // A page must not link to itself. Strict comparison: with "==" PHP may
        // compare numeric-looking titles ("10" and "1e1") as numbers.
        if ($pageRecherche === $chaineRecherchee) {
            continue;
        }

        // Fetch the wikitext of the page.
        $postfields = array(
            'action' => 'parse',
            'format' => 'json',
            'page'   => $pageRecherche,
            'prop'   => 'wikitext'
        );
        $pageContent = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
        $json_pagesContent = json_decode($pageContent, true);
        if (!isset($json_pagesContent["parse"]["wikitext"]["*"])) {
            continue; // page could not be read
        }

        list($text_pagesContent, $nbLiens) = ajouterLiens(
            $json_pagesContent["parse"]["wikitext"]["*"],
            $chaineRecherchee
        );
        if ($nbLiens === 0) {
            continue; // no new link, so there is nothing to save
        }

        // An edit requires a CSRF token.
        $postfields = array(
            'action' => 'query',
            'meta'   => 'tokens',
            'format' => 'json'
        );
        $tokenEdit = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
        $json_tokenEdit = json_decode($tokenEdit, true);
        if (!isset($json_tokenEdit['query']['tokens']['csrftoken'])) {
            trigger_error("No edit token received, page not saved: " . $pageRecherche, E_USER_WARNING);
            continue;
        }
        $text_tokenEdit = $json_tokenEdit['query']['tokens']['csrftoken'];

        // Save the modified wikitext, flagged as a bot edit.
        $postfields = array(
            'action' => 'edit',
            'format' => 'json',
            'title'  => $pageRecherche,
            'text'   => $text_pagesContent,
            'bot'    => '',
            'token'  => $text_tokenEdit
        );
        $edit = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
        echo "\n" . $edit;
    }
}


// Release the cURL handle before deleting the cookie file: cURL writes its
// cookie jar when the handle is destroyed, so deleting the file while the
// handle is still alive would let it be written again at the end of the script.
unset($curl);
unlink($path_cookie);

嗯,我敢肯定这段代码里有很多不必要的东西,但我并不精通PHP和MediaWiki,而且脚本运行良好,所以我认为它还不算太糟 ^^