#
# u2w = Upload To Wikitext
#
# SYNTAX:
# perl u2w.pl <input-file>
#
# Command-line argument: path of the input file that lists the pages to
# upload or edit.
my $in_file = shift || "";
# Fail fast with the usage line, before any network request is made, when no
# input file was given.
die "SYNTAX: perl u2w.pl <input-file>\n" if $in_file eq "";
# Default edit summary sent with each page save.
my $summary = "עדכון אוטומטי";
# Wiki account credentials used for the login request; fill in before running.
my $username = "<username>";
my $password = "<password>";
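# edit_function: the transformation applied to the text of existing pages.
# INPUT: the current page content.
# OUTPUT: the new page content.
# NOTE: To apply this function to existing pages, list them in the input
# file, one line per page, in the form:
# %%%%% title1
# %%%%% title2
# %%%%% title3
# etc.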
sub edit_function {
    # Takes the current wikitext of a page and returns the rewritten wikitext.
    my $content = shift;

    # Rename the category $cat0 to $cat1. \Q...\E keeps the old name literal
    # even if it is later changed to one containing regex metacharacters.
    my $cat0 = "תוכחה, ביקורת";
    my $cat1 = "ביקורת ותוכחה";
    $content =~ s!קטגוריה\s*:\s*\Q$cat0\E!קטגוריה:$cat1!g;

    # Wrap the header and footer templates in <noinclude>. The literal braces
    # are escaped because an unescaped "{" in a pattern is deprecated and is
    # rejected by some Perl versions.
    $content =~ s!(\{\{כותרת עליונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!mg;
    $content =~ s!(\{\{כותרת תחתונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!mg;
    #$content =~ s!^({{קיצור דרך[^{}]+}})$!<noinclude>$1</noinclude>!mg;

    # Collapse two adjacent noinclude tags into the first one, so that a
    # template which was already wrapped does not end up wrapped twice.
    $content =~ s!(</?noinclude>)</?noinclude>!$1!g;
    return $content;
}
use htmlspecialchars;
use Hebrew_utf8;
use TNK_utf8;
package main;
use LWP::Simple;
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
# use WWW::Mechanize; doesn't work: HTTP/1.0 403 Forbidden
use IO::Handle;
# Base URL of the wiki; index.php is addressed directly below it.
$main::TARGET_URL = "http://he.wikisource.org/w";
#print `iconv -f windows-1255 -t utf-8 $in_file > $in_file.utf8`; $in_file = "$in_file.utf8";
# Pages that already have content are overwritten only when the input file
# name contains "existing".
$main::REWRITE_EXISTING_PAGES = (0 || $in_file =~ /existing/);

# Names of the side files, derived from the input file name. When the name
# contains no ".txt" the suffix is appended instead, so that an output file
# can never have the same name as the input file (opening it for writing
# would truncate the input before it is read).
my $file_for_pages_that_already_exist = $in_file;
$file_for_pages_that_already_exist =~ s/[.]txt/.existing.txt/
    or $file_for_pages_that_already_exist .= ".existing.txt";
my $file_for_pages_whose_upload_failed = $in_file;
$file_for_pages_whose_upload_failed =~ s/[.]txt/.failed.txt/
    or $file_for_pages_whose_upload_failed .= ".failed.txt";
my $file_for_debug = $in_file;
$file_for_debug =~ s/[.]txt/.debug.txt/
    or $file_for_debug .= ".debug.txt";

my $count_pages = 0;
my $browser = LWP::UserAgent->new(); # WWW::Mechanize->new();
# Browser-like headers sent with every request. The Accept value is kept on a
# single line so that no line break ends up inside the header value.
my @ns_headers = (
    'User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0',
    'Accept' => 'image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, image/png, */*',
    'Accept-Charset' => 'iso-8859-1,*,utf-8',
    'Accept-Language' => 'en-US',
);
# Keep the session cookies in memory and follow redirects after POST too.
$browser->cookie_jar( {} );
push @{$browser->requests_redirectable}, 'POST';

# Log in; the session cookies are stored in the browser's cookie jar.
my $response;
$response = $browser->post("$main::TARGET_URL/index.php?title=%D7%9E%D7%99%D7%95%D7%97%D7%93:Userlogin&action=submitlogin&type=login&returnto=%D7%A2%D7%9E%D7%95%D7%93_%D7%A8%D7%90%D7%A9%D7%99",
    @ns_headers, Content=>[wpName=>$username,wpPassword=>$password,wpRemember=>"1",wpLoginattempt=>"%D7%9B%D7%A0%D7%99%D7%A1%D7%94+%D7%9C%D7%97%D7%A9%D7%91%D7%95%D7%9F"]);
# Only an HTTP-level error is detectable here; report it but keep going.
warn "Login request failed: " . $response->status_line . "\n" if $response->is_error;

open(DEBUG, '>', $file_for_debug) or die "Could not write to debug file $file_for_debug.\n";
############# upload: #############
my @responses;
# The file of skipped (already existing) pages is needed only when existing
# pages are not overwritten.
unless ($main::REWRITE_EXISTING_PAGES) {
    open(EXISTING_FILE, '>', $file_for_pages_that_already_exist)
        or die "Could not write to $file_for_pages_that_already_exist: $!\n";
    EXISTING_FILE->autoflush(1);
}
open(FAILED_FILE, '>', $file_for_pages_whose_upload_failed)
    or die "Could not write to $file_for_pages_whose_upload_failed: $!\n";
FAILED_FILE->autoflush(1);
my $wpTextbox1=''; my $name_of_page='';
print "uploading to $main::TARGET_URL\n";
open(IN_F, '<', $in_file) or die "Cannot open $in_file as input\n";
# Main loop over the input file. Three kinds of marker lines are recognised:
#   "##### title"  starts a page; the lines that follow are its content,
#   "ENDOFFILE"    ends that page and uploads the collected content,
#   "%%%%% title"  asks for an in-place edit of the existing page.
# Every other line is appended to the content of the current page.
while (<IN_F>) {
    my $line = $_;
    if ($line =~ /^#####(.*)$/) {
        $name_of_page = $1;
        $name_of_page =~ s/^\s+//;
        $name_of_page =~ s/\s+$//;
        next;
    } elsif ($line =~ /^%%%%%(.*)/) {
        $name_of_page = $1;
        $name_of_page =~ s/^\s+//;
        $name_of_page =~ s/\s+$//;
        print "Changing $name_of_page ";
        # The "%%%%%" content tells upload_file to edit the existing text.
        upload_file($name_of_page, "%%%%%");
        print DEBUG "changed $name_of_page\n";
        $wpTextbox1=''; $name_of_page='';
        $count_pages++;
        next;
    } elsif ($line =~ /^ENDOFFILE/) {
        #if ($name_of_page=~/^t/) {$wpTextbox1=''; next;}; # TEMPORARY
        # Fix Word "--" char.
        # NOTE(review): the character of the original pattern did not survive
        # in this copy of the file, which left an empty pattern (an empty
        # pattern makes Perl reuse the last successful one). It is assumed
        # here to have been U+0096 stored as UTF-8, i.e. the cp1252 dash byte
        # read as Latin-1 - confirm against the input data.
        $name_of_page =~ s/\xC2\x96/-/g;
        $wpTextbox1 =~ s/\xC2\x96/-/g;
        # Remove black-listed links:
        $wpTextbox1 =~ s#google[.]com[/]cse([^ ])*#google.com#ig;
        # add navigation bar
        if ($name_of_page =~ /([א-ת ]+ רבה) ([א-ת]+) ([א-ת]+)/) {
            my $book = $1;
            my $chapter = $2;
            my $verse = $3;
            my $sargel = sargel_niwut($book, $chapter, $verse);
            $wpTextbox1 = "$sargel\n$wpTextbox1";
        }
        my $short_name_of_page='';
        if ($name_of_page=~/^ביאור:/) {
            ($short_name_of_page = $name_of_page) =~ s/ביאור://;
            # put shorter names as titles in categories:
            $wpTextbox1 =~ s/\[\[קטגוריה:([^\|\[\]]*)\]\]/\[\[קטגוריה:$1|$short_name_of_page\]\]/ig;
        }
        #print "end $name_of_page \n";
        # Drop any whitespace that precedes a redirect keyword.
        $wpTextbox1 =~ s/\s*(#REDIRECT)/$1/ig;
        $wpTextbox1 =~ s/\s*(#הפניה)/$1/ig;
        $wpTextbox1=~s|\[http://he.judaism.wikia.com/wiki/([a-zA-Z0-9\/_-]*) ([^\]]*)\]|[[$1\|$2]]|g; # fix inner links
        # Only false alarms...
        #if ($wpTextbox1 =~ /ביאור:ביאור/ || $wpTextbox1=~/שיחה:/) {
        # die("Deprecated content: $wpTextbox1");
        #}
        print "$name_of_page";
        upload_file($name_of_page, $wpTextbox1, "");
        print DEBUG "uploaded $name_of_page\n";
        $count_pages++;
        # A shortcut template in the page also gets a redirect page of its
        # own, except when re-running a file of existing pages. The literal
        # braces are escaped because an unescaped "{" in a pattern is
        # deprecated.
        if ($wpTextbox1 =~ /\{\{קיצור דרך\|([^{}]+)\}\}/i && $in_file!~/existing/) {
            my $name_of_shortcut = $1;
            print "$name_of_shortcut";
            # A category target needs a leading colon inside the link.
            (my $redirect_target=$name_of_page) =~ s/^קטגוריה:/:קטגוריה:/;
            upload_file($name_of_shortcut,
                "#REDIRECT [[$redirect_target]]"
            );
            print DEBUG "uploaded shortcut $name_of_page\n";
            $count_pages++;
        }
        $wpTextbox1=''; $name_of_page=''; $short_name_of_page='';
        next;
    }
    $wpTextbox1.=$line;
}
# Final report, then close every handle. Errors of buffered writes surface
# only at close time, so the write handles are checked.
print "Uploaded $count_pages pages. Everything seems to be OK. Log was written to $file_for_debug.";
print " New content of existing pages was written to $file_for_pages_that_already_exist" unless ($main::REWRITE_EXISTING_PAGES);
print "\n\n";
# print DEBUG "_____________ uploads _____________\n";
# print DEBUG @responses;
close(IN_F);
close(DEBUG) or warn "Could not close $file_for_debug: $!\n";
# EXISTING_FILE is opened only when existing pages are not overwritten.
unless ($main::REWRITE_EXISTING_PAGES) {
    close(EXISTING_FILE) or warn "Could not close $file_for_pages_that_already_exist: $!\n";
}
close(FAILED_FILE) or warn "Could not close $file_for_pages_whose_upload_failed: $!\n";
exit;
########################################################
# Fetch the edit form of a page and submit new content for it.
#
# Parameters:
#   $name_of_page - title of the page.
#   $wpTextbox1   - what to do with the page:
#                   text containing "%%%%%" - edit the current content
#                     (edit_function, or one of the two special moves below);
#                   "+++++ <text>" - put <text> above the current content;
#                   anything else - upload it as the new content.
#   $page_summary - optional edit summary; the file-level $summary is used
#                   when it is missing or empty.
# Returns nothing. Progress is printed to STDOUT; skipped and failed pages are
# written to EXISTING_FILE / FAILED_FILE and noted in DEBUG.
sub upload_file {
    my ($name_of_page, $wpTextbox1, $page_summary) = @_;
    # The summary is kept in a local variable, so that a special summary of
    # one page does not leak into the pages uploaded after it.
    my $edit_summary = (defined $page_summary && length $page_summary) ? $page_summary : $summary;
    #my $is_redirect = ($wpTextbox1 =~/#הפניה/ || $wpTextbox1 =~/#redirect/i);
    #return if (!$is_redirect); # TEMP
    # Sanity check
    if (!$name_of_page) {
        print "Empty page name!\n";
        return;
    }
    # Escape the title, so that characters such as "&", "#" and "+" in it
    # cannot change the meaning of the query string.
    my $title_in_url = _u2w_escape_title($name_of_page);
    my $response = $browser->get("$main::TARGET_URL/index.php?title=$title_in_url&action=edit");
    my $output1 = $response->as_string;
    if ($output1 =~ /badtitle/i) {
        # Show the title as a list of byte codes, to expose invisible characters.
        $name_of_page =~ s/(.)/sprintf ("%d ", ord($1))/ge;
        print "Bad title '$name_of_page'\n\n";
        return;
    }
    if ($output1 =~ /mw-recreate-deleted-warn/ && !$main::REWRITE_EXISTING_PAGES) {
        print ": deleted - skipping:\n\n";
        print EXISTING_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
        return;
    }
    # Hidden fields of the edit form. The matches are taken in list context,
    # so a field that is not found is undef rather than a stale capture of an
    # earlier match.
    my ($wpStarttime) = $output1 =~ /value="(\d+)" name="wpStarttime"/;
    my ($wpEdittime) = $output1 =~ /value="(\d+)" name="wpEdittime"/;
    my ($wpEditToken) = $output1 =~ /value="(.+?)" name="wpEditToken"/;
    my ($wpAutoSummary) = $output1 =~ /name="wpAutoSummary" type="hidden" value="(.+?)"/;
    my ($wpOldContent) = $output1 =~ /<textarea[^<>]*>(.*?)<\/textarea>/s;
    $wpOldContent = '' unless defined $wpOldContent;
    if (!defined $wpEditToken) {
        # Without an edit token nothing can be saved; record the page for a
        # retry instead of posting a form that cannot succeed.
        print " No edit token found - skipping!\n\n";
        print FAILED_FILE ($wpTextbox1 =~ /%%%%%/
            ? "%%%%% $name_of_page\n"
            : "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n");
        print DEBUG "$name_of_page failed - no edit token!\n";
        return;
    }
    if ($wpTextbox1 =~ /%%%%%/) { # edit
        $wpOldContent = htmlspecialchars_decode($wpOldContent);
        if ($in_file =~ /milon/ && $name_of_page =~ /קטגוריה:([א-ת]+) \(שורש\)/) {
            # Move the content from the category pages to the root pages in Wiktionary
            my $jorj = $1;
            my $ot0 = substr($jorj,0,2); # the first two bytes of the root
            #print "jorj=$jorj; ot=$ot0; \n"; die;
            if ($wpOldContent=~/נטיות פעלים/) {
                my $newContent = $wpOldContent;
                $newContent =~ s/קטגוריה:$ot0 \(שורשים\)/קטגוריה:$jorj (שורש)|*/;
                upload_file("$jorj (שורש)", $newContent, "העברה מדף הקטגוריה של השורש");
                $wpTextbox1 = "{{:$jorj (שורש)}}\n<noinclude>[[קטגוריה:$ot0 (שורשים)]]</noinclude>\n";
                $edit_summary = "העברת התוכן לדף השורש";
            } else {
                print "$name_of_page already changed - skipping\n";
                return;
            }
        } elsif ($name_of_page =~ /רבי יונה על/) {
            # Rename the pages of the Rabbenu Yonah commentaries
            (my $new_name_of_page = $name_of_page) =~ s/רבי יונה/רבנו יונה/g;
            if ($wpOldContent!~/הפניה/) {
                my $newContent = $wpOldContent;
                $newContent =~ s/רבי יונה/רבנו יונה/g;
                upload_file($new_name_of_page, $newContent, "העברה מהדף [[$name_of_page]]");
                $wpTextbox1 = "#הפניה [[$new_name_of_page]]\n";
                $edit_summary = "העברה לדף [[$new_name_of_page]]";
            } else {
                print "$name_of_page already changed - skipping\n";
                return;
            }
        } else {
            $wpTextbox1 = edit_function($wpOldContent);
            #print "Current content: " . substr($wpOldContent,0,1000) . "...\n";
            # print "New content: " . substr($wpTextbox1,0,1000) . "...\n";
            # Sanity check
            if (!$wpTextbox1) {
                die "New content is empty - probably a bug in your edit_function!";
            }
        }
    } elsif ($wpTextbox1 =~ /[+][+][+][+][+]\s+(.*)/s) { # add
        my $contentToAdd = $1;
        if ($contentToAdd =~ /קיצור (.*)/) {
            # The added text asks for a shortcut: add the shortcut template
            # and create the redirect page of the shortcut.
            $contentToAdd = "<noinclude>{{קיצור דרך|$1}}</noinclude>";
            my $name_of_shortcut = $1;
            print " $name_of_shortcut";
            upload_file($name_of_shortcut,
                "#REDIRECT [[$name_of_page]]"
            );
            print DEBUG "uploaded shortcut $name_of_page\n";
            $count_pages++;
        }
        $wpTextbox1 = htmlspecialchars_decode("{{דרוש שילוב}}\n----\n$contentToAdd\n----\n$wpOldContent");
    } else { # upload
        my $are_both_redirects =
            ($wpOldContent =~/#הפניה/ || $wpOldContent =~/#redirect/i) &&
            ($wpTextbox1 =~/#הפניה/ || $wpTextbox1 =~/#redirect/i);
        if ($wpOldContent && $wpOldContent=~/./ && !$main::REWRITE_EXISTING_PAGES && !$are_both_redirects) {
            print ": already contains data - skipping:\n\n";# \n$wpOldContent\n";
            print EXISTING_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
            return;
        } else { # usual upload - do some conversions before uploading:
            # convert indirect links to wikisource to direct links
            if ($main::TARGET_URL =~ /wikisource/) {
                $wpTextbox1 =~ s{\[http://he.wikisource.org/wiki/([^ \]]+) ([^\]]+)]}{[[$1|$2]]}ig;
            }
        }
    }
    # the upload itself
    $response=$browser->post("$main::TARGET_URL/index.php?title=$title_in_url&action=submit",
        @ns_headers,Content_Type=>'form-data',Content=>
        [
            wpStarttime=>$wpStarttime,
            wpEdittime=>$wpEdittime,
            wpTextbox1=>"$wpTextbox1",
            wpSummary=>$edit_summary,
            wpEditToken=>$wpEditToken,
            wpAutoSummary=>$wpAutoSummary,
            wpWatchthis=>'on'
        ]);
    push @responses,$response->as_string;
    if($response->code!=302 && $response->code!=200) {
        print " Upload failed!\n\n";
        print FAILED_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
        print DEBUG "$name_of_page failed!\n";
    } elsif ($response->content =~ /badtitle/i) {
        print " Bad title!\n\n";
    } elsif ($response->content =~ /ספאם/i) {
        print " Spam!\n\n";
        print FAILED_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
        print DEBUG "$name_of_page failed - spam!\n";
    } else {
        #print $response->content;
        print " $name_of_page Uploaded successfully.\n";
    }
}

# Percent-encode a page title for use as the value of a query parameter.
# Assumes the title is a byte string (this script reads its input without a
# decoding layer), so every escaped character is a single byte.
sub _u2w_escape_title {
    my ($title) = @_;
    $title =~ s/([^A-Za-z0-9_.~-])/sprintf("%%%02X", ord($1))/ge;
    return $title;
}
# create a navigation bar
sub sargel_niwut {
    # Builds the wikitext of a "סרגל ניווט" template call.
    #   With a chapter:    {{סרגל ניווט|book|part|previous|chapter|next}}
    #   Without a chapter: {{סרגל ניווט|book||previous|part|next}}
    # The previous/next values are computed with the Hebrew:: number helpers.
    my ($book, $part, $chapter) = @_;
    my @fields;
    if (!$chapter) {
        # Part level: the part before the first one is the introduction, and
        # the next entry is the first chapter of this part.
        my $before = $part eq 'א'
            ? "הקדמה"
            : Hebrew::number2hebrew(Hebrew::hebrew2number($part)-1);
        my $after = "$part א";
        @fields = ($book, "", $before, $part, $after);
    } else {
        # Chapter level: the first chapter has no previous chapter.
        my $before = $chapter eq 'א'
            ? ""
            : Hebrew::number2hebrew(Hebrew::hebrew2number($chapter)-1);
        my $after = Hebrew::number2hebrew(Hebrew::hebrew2number($chapter)+1);
        @fields = ($book, $part, $before, $chapter, $after);
    }
    return "{{" . join("|", "סרגל ניווט", @fields) . "}}";
}
__END__