使用者:STcatBot/2.0
STcatBot 2.0 源代碼:
#!/usr/bin/perl
# STcatBot2.0.pl - Simplified and Traditional CATegorization roBOT
# By WikiPedia:User:下一次登录
# Portions largely taken or based on upload.pl by WikiPedia:User:Eloquence
# and mwpush.pl by WikiPedia:User:KeithTyler
# Tested on WindowsXP/Cygwin/ActivePerl
# Corresponding robot: User:STcatBot (application in progress)
# Disclaimer: No warranty guaranteed. Use at your own risk.
# call requirements
use Getopt::Std;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
#use warnings;
# --- Configuration ----------------------------------------------------------
my $username="STcatBot"; # wiki account name; only English names are tested
my $password="****"; # account password: fill in locally, never publish the real value
my $WIKI_PATH="zh.wikipedia.org"; # host name of the target wiki
my $WIKI_PAGE; # URL form of the article title currently being processed

### Login to wiki
# One user agent is shared by every request so that the session cookies
# obtained at login are sent with all later requests.
my $browser=LWP::UserAgent->new();
# Header list appended to every request made through $browser.
my @ns_headers = (
    'User-Agent' => 'STcatBot 2.0 by 下一次登录',
    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset' => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);
# Hold cookies (in memory only)
$browser->cookie_jar( {} );
# Make login request
my $response=$browser->post(
    "http://".$WIKI_PATH."/w/index.php?title=Special:Userlogin&action=submitlogin",
    @ns_headers,
    Content=>[
        wpName=>$username,
        wpPassword=>$password,
        wpRemember=>"1",
        wpLoginAttempt=>"Log in",
    ],
);
# After logging in, we should be redirected to another page.
# If we aren't, something is wrong: report it, save the server reply for
# inspection and stop with a non-zero exit status.
if($response->code!=302) { #cannot login
    print STDERR
"We weren't able to login. This could have the following causes:
* The username ($username) or password may be incorrect.
  Solution: Re-run script with correct credentials.
* The MediaWiki software on the target host has been upgraded.
  Solution: Update the login request in this script to match the new login form.
* The wiki has its cookie check disabled and does not redirect after login.
  Solution: Relax the redirect check in this script.
Regardless, we will now try to write the output from the server to
rfget.debug.out....\n\n";
    open(my $debug_fh, '>', 'rfget.debug.out')
        or die "Could not write rfget.debug.out: $!\n";
    print {$debug_fh} $response->as_string;
    # Buffered write errors only surface at close, so check it before
    # claiming that the dump succeeded.
    close($debug_fh)
        or die "Could not finish writing rfget.debug.out: $!\n";
    print STDERR
"This seems to have worked. Take a look at rfget.debug.out for further information.\n";
    exit 1;
}
# --- Working state shared by the whole script ---------------------------------
# The main loop reuses these variables as scratch space; they are declared
# once here and overwritten on every iteration.
my $URL;              # URL of the request currently being built
my $filename1;        # Special:Allpages reply, consumed while it is parsed
my $filestartstr;     # first search string (start marker)
my $filestart;        # position of the start marker
my $fileendstr;       # second search string (end marker)
my $fileend;          # position of the end marker
my $filename;         # file name extracted
my $pagecontent;      # target page content, narrowed down to its category links
my $redcat;           # index() result: negative = no red category link ahead
my @unicat;           # red category names in URL form
my @oricat;           # red category names as displayed on the page
my @tarcat;           # existing counterpart of each red category, or -1 if none
my $catlinecontent;   # HTML of one category link
my $catcount;         # number of red categories found on the current page
my $probecatcontent;  # reply to the edit-page probe of one red category
my $emptyprobe="class=\"selected new\""; # marker whose presence in the probe reply means the category does not exist
my $oricattemp;       # temp string
my $editToken;        # wpEditToken taken from the edit form
my $editTime;         # wpEdittime taken from the edit form (was used but never declared)
my $catfound;         # 1 once a category link was rewritten in the current page
my $stcatfound;       # 1 once a red category with an existing counterpart was seen
my $content2;         # page text taken from the edit form, then edited
my $content1;         # raw reply holding the edit form
my $special_char;     # escaped character sequence being replaced
my $contain_char;     # flag for escaped characters (only ever reset)
my $changemade=0;     # number of pages saved during this run
my $article_count=0;  # number of articles taken from the current listing
my @article_name;     # article titles as displayed, used for logging
my @article_unicode;  # article titles in URL form, used for requests
my $last_string;      # URL form of the last article of the previous listing (init="%21")
my $article_line;     # one table cell of the listing
my $article_ID;       # index of the article currently being processed
while(1) { #process
# Load the resume point from last_string.txt: the URL form of the article
# title at which the next Special:Allpages listing should start.
# A missing or unreadable file means "start at the beginning" (empty string).
$last_string="";
if (open(my $resume_in, '<', 'last_string.txt')) {
    local $/;                          # slurp the whole file in one read
    my $saved=<$resume_in>;
    close($resume_in);
    $last_string=$saved if defined $saved;
    # The value is pasted into a URL below, so a trailing newline left by a
    # text editor must not survive.
    $last_string =~ s/\A\s+|\s+\z//g;
}
else {
    warn "Could not read last_string.txt ($!); starting from the first page.\n";
}
#print $last_string;
# Fetch the Special:Allpages listing that starts at $last_string.
# The request is retried a few times; if it keeps failing the script stops
# here, before anything is parsed or last_string.txt is rewritten, so a later
# run resumes from the same place.
$URL="http://".$WIKI_PATH."/wiki/Special:Allpages/".$last_string;
for my $attempt (1 .. 3) {
    $response=$browser->get($URL, @ns_headers);
    last if $response->is_success;
    warn "Fetching $URL failed (".$response->status_line."), attempt $attempt of 3.\n";
    sleep 5 if $attempt < 3;
}
die "Giving up on $URL: ".$response->status_line."\n"
    unless $response->is_success;
$filename1=$response->as_string;
$article_count=0; #reset the article count
# Keep only the article table: everything after the opening <table> tag ...
$filestartstr="<table style=\"background: inherit;\" border=\"0\" width=\"100%\">";
$filestart=index($filename1, $filestartstr);
die "Unexpected Special:Allpages layout at $URL: article table not found.\n"
    if $filestart < 0;
$filename1=substr($filename1, $filestart+length($filestartstr));
# ... and before the print footer, when there is one.
$fileendstr="<div class=\"printfooter\">";
$fileend=index($filename1, $fileendstr);
$filename1=substr($filename1, 0, $fileend) if $fileend >= 0;
# Collect every non-redirect article of the listing into the parallel arrays
# @article_unicode (URL form, from the link target) and @article_name
# (display form, from the link's title attribute). $article_count ends up as
# the number of entries stored.

# Returns a list of [url_form, display_name] pairs, one per <td>...</td> cell
# of $table_html that links to an article. Cells marked as redirects and cells
# without an article link are skipped. (Named subs are compiled once, so the
# definition may sit inside the loop body.)
sub _allpages_entries {
    my ($table_html) = @_;
    my @entries;
    return @entries unless defined $table_html;
    while ($table_html =~ m{<td>(.*?)</td>}gs) {
        my $cell = $1;
        # redirects are wrapped in a dedicated <div>
        next if index($cell, "<div class=\"allpagesredirect\">") >= 0;
        # link target first, then the next title attribute after it
        next unless $cell =~ m{<a href="/wiki/([^"]*)".*?title="([^"]*)"}s;
        push @entries, [ $1, $2 ];
    }
    return @entries;
}

for my $entry (_allpages_entries($filename1)) {
    $article_unicode[$article_count]=$entry->[0];
    $article_name[$article_count]=$entry->[1];
    $article_count+=1;
}
$article_ID=0;
while($article_ID<$article_count) { #go through all the pages and process
# Reset the per-article flags.
$catfound=0; #is there any change?
$stcatfound=0; #is there any s/t cat?
# Fetch the rendered article.
$WIKI_PAGE=$article_unicode[$article_ID];
$URL="http://".$WIKI_PATH."/wiki/".$WIKI_PAGE;
$response=$browser->get($URL, @ns_headers);
if ($response->is_success) {
    $pagecontent=$response->as_string;
    print "\nConnected... ";
}
else {
    # Treat an unreachable page as one without categories.
    $pagecontent="";
    print "\nFetch failed (".$response->status_line.")... ";
}
# Find the category box: it starts right after the link to Special:Categories.
$filestartstr="<a href=\"/wiki/Special:Categories\" title=\"Special:Categories\">";
$redcat=index($pagecontent, $filestartstr);
$catcount=0;
if ($redcat>=0) { #there is(are) cat(s), search red cat(s)
    # Keep at most 10000 characters after the marker, up to the closing </div>.
    $pagecontent=substr($pagecontent, $redcat+length($filestartstr), 10000);
    $fileendstr="</div>";
    $fileend=index($pagecontent, $fileendstr);
    # The 4 characters before </div> are dropped as well; presumably a
    # closing tag such as </p> -- TODO confirm against the page HTML.
    $pagecontent=substr($pagecontent, 0, $fileend-4) if $fileend >= 4;
    # A link to an edit form inside the box marks a red (missing) category.
    $filestartstr="action=edit";
    $redcat=index($pagecontent, $filestartstr);
    print "Cat found... ";
}
if ($redcat<0) { #no category box, or no red link inside it
    print "No redcat.";
}
else {
    # Record the page URL; logging is best effort and never stops the run.
    if (open(my $cat_log, '>>', 'cat_log.txt')) {
        print {$cat_log} $URL, "\n";
        close($cat_log);
    }
    else {
        warn "Could not append to cat_log.txt: $!\n";
    }
    print "Redcat found... ";
}
# Walk the category links that remain in $pagecontent. For every red (missing)
# category: store its URL form in @unicat and its display name in @oricat,
# then probe the wiki for an existing counterpart and store that title in
# @tarcat (-1 when there is none). $catcount counts the red categories seen.
while($redcat>=0) { #a red category link is still ahead
    # Cut the next link out of $pagecontent and drop it from the remainder.
    $filestartstr="<a href";
    $fileendstr="</a></span>";
    $filestart=index($pagecontent, $filestartstr);
    $fileend=index($pagecontent, $fileendstr);
    $catlinecontent=substr($pagecontent, $filestart, $fileend-$filestart);
    # 14 = the 11 characters of "</a></span>" plus 3 more; presumably the
    # separator between two links -- TODO confirm against the page HTML.
    $pagecontent=substr($pagecontent, $fileend+14, 10000);
    #is the cat red?
    $filestartstr="action=edit";
    if(index($catlinecontent, $filestartstr)>=0) { #if the cat is red...
        # URL form of the name: the text between the link prefix and the
        # action parameter. 28 is the length of '<a href="/w/index.php?title='.
        # The ampersand is escaped in rendered HTML, so look for the escaped
        # form first and fall back to the bare one.
        $filestart=28;
        $fileendstr="&amp;action=edit";
        $fileend=index($catlinecontent, $fileendstr);
        if($fileend<0) {
            $fileendstr="&action=edit";
            $fileend=index($catlinecontent, $fileendstr);
        }
        $unicat[$catcount]=substr($catlinecontent, $filestart, $fileend-$filestart);
        # Display form of the name. What follows 'title="Category:' is assumed
        # to read NAME">NAME, so half of it minus one character is NAME
        # -- TODO confirm against the page HTML.
        $filestartstr="title=\"Category:";
        $filestart=index($catlinecontent, $filestartstr);
        $oricattemp=substr($catlinecontent, $filestart+16, 1000);
        $oricat[$catcount]=substr($oricattemp, 0, length($oricattemp)/2-1);
        #does it have a simp/trad corresponding cat?
        $URL="http://".$WIKI_PATH."/w/index.php?title=".$unicat[$catcount]."&action=edit";
        $response=$browser->get($URL, @ns_headers);
        $probecatcontent=$response->as_string ;
        # Only accept a counterpart when the probe succeeded, the page is not
        # flagged as new, and the <title> could be parsed; anything else would
        # later be written into the article as a category name.
        $tarcat[$catcount]=-1;
        if($response->is_success and index($probecatcontent, $emptyprobe)<0) {
            $filestartstr="<title>";
            $fileendstr=" - Wikipedia</title>";
            $filestart=index($probecatcontent, $filestartstr);
            # 28 = "<title>" (7) plus 21 characters in front of the category
            # name; presumably the edit-page heading and the namespace prefix
            # -- TODO confirm against the page HTML.
            $fileend=$filestart<0 ? -1 : index($probecatcontent, $fileendstr, $filestart+28);
            if($fileend>$filestart+28) {
                $tarcat[$catcount]=substr($probecatcontent, $filestart+28, $fileend-$filestart-28);
            }
        }
        if($tarcat[$catcount] ne "-1") { #there is a corresponding cat
            print "s/t ";
            $stcatfound=1;
        }
        else {
            print "n/e ";
        }
        #count the red cats
        $catcount+=1;
    }
    # Is there another red link in what is left?
    $filestartstr="action=edit";
    $redcat=index($pagecontent, $filestartstr);
}
# Rewrites links to category $old_name so that they point to $new_name.
# Both "[category:" and "[Category:" spellings are recognised; the name must
# be followed by "]" or "|", so a longer category that merely starts with
# $old_name is left alone. Every match is visited once, which also terminates
# when $new_name itself contains $old_name.
# Returns (new text, number of links rewritten).
# (Named subs are compiled once, so the definition may sit inside the loop.)
sub _retarget_category_links {
    my ($wikitext, $old_name, $new_name) = @_;
    return ($wikitext, 0)
        unless defined $wikitext and defined $old_name and length $old_name
           and defined $new_name;
    my $hits = ($wikitext =~ s/\[[cC]ategory:\Q$old_name\E(?=\s*[\]|])/[Category:$new_name/g);
    return ($wikitext, $hits || 0);
}

if($catcount>0) { #if change needed, process the content
    # Note the number of red categories; logging is best effort.
    if (open(my $cat_log, '>>', 'cat_log.txt')) {
        print {$cat_log} "Found ", $catcount, " red cat(s).\n";
        close($cat_log);
    }
    else {
        warn "Could not append to cat_log.txt: $!\n";
    }
    # Fetch the edit form of the article.
    $URL="http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=edit";
    $response=$browser->get($URL, @ns_headers);
    $content1=$response->as_string;
    # Get EditToken and edit time; "+" is accepted too because newer tokens
    # carry a "+\" suffix.
    ($editToken) = ( $content1 =~ m/value="([0-9a-f+\\]*)" name="wpEditToken"/ );
    ($editTime) = ( $content1 =~ m/value="([0-9a-f]*)" name="wpEdittime"/ );
    # Take the page text from the wpTextbox1 textarea whatever its other
    # attributes are. Without a usable form nothing is substituted, so
    # $catfound stays 0 and no save is attempted.
    if($response->is_success
        and defined $editToken and length $editToken
        and $content1 =~ m{<textarea\b[^>]*\bname="wpTextbox1"[^>]*>(.*?)</textarea>}s) {
        $content2=$1;
        #substitute
        for my $i (0 .. $catcount-1) {
            # -1 marks a red category without an existing counterpart
            next unless defined $tarcat[$i] and $tarcat[$i] ne "-1";
            my ($rewritten, $hits) =
                _retarget_category_links($content2, $oricat[$i], $tarcat[$i]);
            $content2=$rewritten;
            $catfound=1 if $hits;
        }
    }
    else {
        $content2="";
        print "Edit form not usable... ";
    }
}
# The text taken from the edit form is HTML-escaped; turn it back into plain
# wikitext before it is posted.

# Decodes the four entities &quot; &lt; &gt; &amp; in a single pass, so
# that already-decoded output is never decoded a second time
# ("&amp;lt;" becomes "&lt;", not "<"). Returns the decoded text; undef is
# passed through unchanged.
# (Named subs are compiled once, so the definition may sit inside the loop.)
sub _decode_basic_entities {
    my ($escaped) = @_;
    return $escaped unless defined $escaped;
    my %char_for = (quot => '"', lt => '<', gt => '>', amp => '&');
    $escaped =~ s/&(quot|lt|gt|amp);/$char_for{$1}/g;
    return $escaped;
}

$contain_char=-1;
$content2=_decode_basic_entities($content2);
# Save the page when a category link was rewritten; otherwise, if a red
# category with an existing counterpart was seen but no link could be
# rewritten, list the article in ns_log.txt for manual review.
if($catfound==1) { #if there is changes to be made
    print "s/t cat found... ";
    if(defined $editToken and length $editToken) {
        print "Updating... ";
        $response=$browser->post(
            "http://".$WIKI_PATH."/w/index.php?title=".$WIKI_PAGE."&action=submit",
            @ns_headers,
            Content_Type=>'form-data',
            Content=>[
                wpTextbox1 => $content2,
                wpSummary => "[[User:STcatBot|STcatBot]]: simp/trad catnames",
                wpSave => "Save page",
                wpSection => "",
                wpEdittime => $editTime,
                wpEditToken => $editToken,
                wpMinoredit => "1",
            ]);
        # A redirect is taken as a successful save (the login check relies on
        # the same convention); any other reply is treated as a rejected edit.
        my $saved=$response->is_redirect;
        # Logging is best effort and never stops the run.
        if (open(my $cat_log, '>>', 'cat_log.txt')) {
            print {$cat_log} $saved
                ? "Change made\n\n"
                : "Change FAILED: ".$response->status_line."\n\n";
            close($cat_log);
        }
        else {
            warn "Could not append to cat_log.txt: $!\n";
        }
        if($saved) {
            $changemade+=1;
            print "Change made. Sleep.";
        }
        else {
            print "Update failed (".$response->status_line."). Sleep.";
        }
        # Pause after every save attempt to limit the edit rate.
        sleep 1;
    }
    else {
        print "No edit token; update skipped.";
    }
}
else { #if cannot make changes
    if($stcatfound==1) {
        print "No substritute found.";
        if (open(my $ns_log, '>>', 'ns_log.txt')) {
            print {$ns_log} "#[[", $article_name[$article_ID], "]]\n";
            close($ns_log);
        }
        else {
            warn "Could not append to ns_log.txt: $!\n";
        }
    }
}
$article_ID+=1;
} #while ID<count
# Save the resume point for the next listing: the URL form of the last
# article collected from this one. The run ends when the listing produced no
# article, or when its last article is the one this listing started from,
# because the next request would return the same listing again.
my $resume_from=$article_count>0 ? $article_unicode[$article_count-1] : undef;
my $listing_start=defined $last_string ? $last_string : "";
$listing_start =~ s/\A\s+|\s+\z//g;
if(!defined $resume_from or $resume_from eq $listing_start) {
    print "\nNo further pages to process. $changemade change(s) made.\n";
    last;
}
open(my $resume_out, '>', 'last_string.txt')
    or die "Could not write last_string.txt: $!\n";
print {$resume_out} $resume_from;
# Buffered write errors only surface at close.
close($resume_out)
    or die "Could not finish writing last_string.txt: $!\n";
} #while whole