# משתמש:Erel Segal/סקריפט להעלאת דפים  (wiki page title: "User:Erel Segal / page upload script")


#
# u2w = Upload To Wikitext
#
# SYNTAX:
#	perl u2w.pl <input-file>
#

# Path of the input file, taken from the first command-line argument
# (empty string when no argument was given).
my $in_file = shift(@ARGV) || "";

# Edit summary sent with every upload, followed by the login credentials
# (replace the placeholders before running).
my ($summary, $username, $password) = (
	"עדכון אוטומטי",
	"<username>",
	"<password>",
);



# edit_function($content)
#
# INPUT:  the current wikitext of a page.
# OUTPUT: the new wikitext of that page.
#
# Applied by upload_file to every page listed in "edit" mode.  To use this
# function, create an input file with entries such as:
#    %%%%% title1
#    %%%%% title2
#    %%%%% title3
# etc...
#
# The current rules are:
#   1. rename one category (see $cat0 / $cat1 below);
#   2. wrap the two header/footer templates named below in <noinclude>;
#   3. collapse two directly adjacent (opening or closing) noinclude tags
#      into the first one, so that re-running rule 2 does not nest tags.
sub edit_function {
	my $content = shift;

	# Old and new category names.
	my ($cat0, $cat1) = ("תוכחה, ביקורת", "ביקורת ותוכחה");

	# \Q..\E: the old name is literal text, not a pattern.
	$content =~ s!קטגוריה\s*:\s*\Q$cat0\E!קטגוריה:$cat1!g;

	# Literal braces are escaped: an unescaped "{" inside a pattern is
	# deprecated (and in some positions fatal) in newer perls.
	$content =~ s!(\{\{כותרת עליונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!g;
	$content =~ s!(\{\{כותרת תחתונה תנך ללא ניקוד\|.*?\}\})!<noinclude>$1</noinclude>!g;

	# Disabled rule, kept for reference:
	#$content =~ s!^({{קיצור דרך[^{}]+}})$!<noinclude>$1</noinclude>!mg;

	# Two adjacent noinclude tags -> keep only the first of the pair.
	$content =~ s!(</?noinclude>)</?noinclude>!$1!g;

	return $content;
}

use htmlspecialchars;
use Hebrew_utf8;
use TNK_utf8;

package main;

use LWP::Simple;

use strict;
use warnings;


use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTTP::Cookies;
# use WWW::Mechanize;  doesn't work: HTTP/1.0 403 Forbidden

use IO::Handle;

# Base URL of the target wiki's script directory.
$main::TARGET_URL = "http://he.wikisource.org/w";

#print `iconv -f windows-1255 -t utf-8 $in_file > $in_file.utf8`; $in_file = "$in_file.utf8";

# Refuse to run without an input file, before any network request is made.
die "SYNTAX: perl u2w.pl <input-file>\n" unless length $in_file;

# Existing pages are overwritten only when the input file name contains
# "existing" (i.e. it is a file previously written by this script).
$main::REWRITE_EXISTING_PAGES = (0 || $in_file =~ /existing/);

# Derive the names of the three output files from the input file name by
# turning ".txt" into ".<tag>.txt".  When the input name contains no ".txt"
# the tag is appended instead; otherwise the output names would equal the
# input name and opening them for writing would truncate the input file.
my %derived_file;
for my $tag (qw(existing failed debug)) {
	my $derived = $in_file;
	$derived =~ s/[.]txt/.$tag.txt/ or $derived .= ".$tag.txt";
	$derived_file{$tag} = $derived;
}
my $file_for_pages_that_already_exist  = $derived_file{existing};
my $file_for_pages_whose_upload_failed = $derived_file{failed};
my $file_for_debug                     = $derived_file{debug};

my $count_pages = 0;
my $browser=LWP::UserAgent->new(); # WWW::Mechanize->new();

# Browser-like headers sent with every request.
my @ns_headers = (
   'User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7) Gecko/20041107 Firefox/1.0',
   'Accept' => 'image/gif, image/x-xbitmap, image/jpeg,
        image/pjpeg, image/png, */*',
   'Accept-Charset' => 'iso-8859-1,*,utf-8',
   'Accept-Language' => 'en-US',
);

# Keep the session cookies in memory and follow redirects after POST.
$browser->cookie_jar( {} );
push @{$browser->requests_redirectable}, 'POST';


############# login: #############
my $response;
$response=$browser->post("$main::TARGET_URL/index.php?title=%D7%9E%D7%99%D7%95%D7%97%D7%93:Userlogin&action=submitlogin&type=login&returnto=%D7%A2%D7%9E%D7%95%D7%93_%D7%A8%D7%90%D7%A9%D7%99",

@ns_headers, Content=>[wpName=>$username,wpPassword=>$password,wpRemember=>"1",wpLoginattempt=>"%D7%9B%D7%A0%D7%99%D7%A1%D7%94+%D7%9C%D7%97%D7%A9%D7%91%D7%95%D7%9F"]);

# Report (but do not abort on) an HTTP-level login failure.
unless ($response->is_success || $response->is_redirect) {
	warn "Login request failed: " . $response->status_line . "\n";
}


open(DEBUG, '>', $file_for_debug) or die "Could not write to debug file $file_for_debug: $!\n";


############# upload: #############
my @responses;

# EXISTING_FILE collects the new content of pages that were skipped because
# they already exist; it is only needed when existing pages are not rewritten.
unless ($main::REWRITE_EXISTING_PAGES) {
	open(EXISTING_FILE, '>', $file_for_pages_that_already_exist)
		or die "Could not write to $file_for_pages_that_already_exist: $!\n";
	EXISTING_FILE->autoflush(1);
}

# FAILED_FILE collects pages whose upload failed, in the input-file format.
open(FAILED_FILE, '>', $file_for_pages_whose_upload_failed)
	or die "Could not write to $file_for_pages_whose_upload_failed: $!\n";
FAILED_FILE->autoflush(1);


# Content and title of the page currently being read from the input file.
my $wpTextbox1=''; my $name_of_page='';

print "uploading to $main::TARGET_URL\n";


# Input format:
#   "##### title"  starts a page; the following lines are its content,
#                  terminated by a line starting with "ENDOFFILE".
#   "%%%%% title"  asks upload_file to edit the existing page in place.
open(IN_F, '<', $in_file) or die "Cannot open $in_file as input: $!\n";
while (my $line = <IN_F>) {
	if ($line =~ /^#####(.*)$/) {
		$name_of_page = $1;
		$name_of_page =~ s/^\s*//g;
		$name_of_page =~ s/\s*$//g;
		next;
	} elsif ($line =~ /^%%%%%(.*)/) {
		$name_of_page = $1;
		$name_of_page =~ s/^\s*//g;
		$name_of_page =~ s/\s*$//g;
		print "Changing $name_of_page ";
		upload_file($name_of_page, "%%%%%");
		print DEBUG "changed $name_of_page\n";
		$wpTextbox1=''; $name_of_page='';
		$count_pages++;
		next;
	} elsif ($line =~ /^ENDOFFILE/) {
		#if ($name_of_page=~/^t/) {$wpTextbox1=''; next;};  # TEMPORARY

		# Fix Word "--" char:
		$name_of_page =~ s/–/-/g;
		$wpTextbox1 =~ s/–/-/g;

		# Remove black-listed links:
		$wpTextbox1 =~ s#google[.]com[/]cse([^ ])*#google.com#ig;

		# add navigation bar
		if ($name_of_page =~ /([א-ת ]+ רבה) ([א-ת]+) ([א-ת]+)/) {
			my $book = $1;
			my $chapter = $2;
			my $verse = $3;
			my $sargel = sargel_niwut($book, $chapter, $verse);
			$wpTextbox1 = "$sargel\n$wpTextbox1";
		}

		my $short_name_of_page='';
		if ($name_of_page=~/^ביאור:/) {
			($short_name_of_page = $name_of_page) =~ s/ביאור://;

			# put shorter names as titles in categories:
			$wpTextbox1 =~ s/\[\[קטגוריה:([^\|\[\]]*)\]\]/\[\[קטגוריה:$1|$short_name_of_page\]\]/ig;
		}


		#print "end $name_of_page \n";

		# A redirect directive must not be preceded by whitespace.
		$wpTextbox1 =~ s/\s*(#REDIRECT)/$1/ig;
		$wpTextbox1 =~ s/\s*(#הפניה)/$1/ig;

		$wpTextbox1=~s|\[http://he.judaism.wikia.com/wiki/([a-zA-Z0-9\/_-]*) ([^\]]*)\]|[[$1\|$2]]|g; # fix inner links

		# Only false alarms...
		#if ($wpTextbox1 =~ /ביאור:ביאור/ || $wpTextbox1=~/שיחה:/) {
		#	die("Deprecated content: $wpTextbox1");
		#}

		print "$name_of_page";
		upload_file($name_of_page, $wpTextbox1, "");
		print DEBUG "uploaded $name_of_page\n";
		$count_pages++;

		# A page that declares a shortcut also gets a redirect page under
		# the shortcut's name (not when re-uploading an "existing" file).
		# Literal braces are escaped for compatibility with newer perls.
		if ($wpTextbox1 =~ /\{\{קיצור דרך\|([^{}]+)\}\}/i && $in_file!~/existing/) {
			my $name_of_shortcut = $1;
			print "$name_of_shortcut";
			(my $redirect_target=$name_of_page) =~ s/^קטגוריה:/:קטגוריה:/;
			upload_file($name_of_shortcut,
				"#REDIRECT [[$redirect_target]]"
				);
			print DEBUG "uploaded shortcut $name_of_page\n";
			$count_pages++;
		}

		$wpTextbox1=''; $name_of_page=''; $short_name_of_page='';
		next;
	}
	$wpTextbox1.=$line;
}
close(IN_F);

print "Uploaded $count_pages pages. Everything seems to be OK. Log was written to $file_for_debug.";
print " New content of existing pages was written to  $file_for_pages_that_already_exist" unless ($main::REWRITE_EXISTING_PAGES);
print "\n\n";
# print DEBUG "_____________ uploads  _____________\n";
# print DEBUG @responses;

close(DEBUG);
close(EXISTING_FILE) unless ($main::REWRITE_EXISTING_PAGES);
close(FAILED_FILE);
exit;

########################################################
# upload_file($name_of_page, $wpTextbox1 [, $page_summary])
#
# Fetches the edit form of $name_of_page and submits new content for it.
#
#   $name_of_page  title of the page to write.
#   $wpTextbox1    selects the mode:
#                    contains "%%%%%"      - edit: the new content is computed
#                                            from the current page content;
#                    matches "+++++ text"  - add: text is put above the
#                                            current page content;
#                    anything else         - upload: the text itself is the
#                                            new page content.
#   $page_summary  optional edit summary for this call; when missing or
#                  empty the file-level $summary is used.
#
# Returns nothing.  Progress is printed to STDOUT; skipped and failed pages
# are written to EXISTING_FILE / FAILED_FILE in the input-file format.
sub upload_file {
	my ($name_of_page, $wpTextbox1, $page_summary) = @_;

	# The summary is kept per call: changing the shared $summary here would
	# leak a one-off summary into every page uploaded afterwards.
	my $edit_summary = (defined $page_summary && length $page_summary) ? $page_summary : $summary;

	#my $is_redirect = ($wpTextbox1 =~/#הפניה/ || $wpTextbox1 =~/#redirect/i);
	#return if (!$is_redirect);  # TEMP

	# Sanity check
	if (!$name_of_page) {
		print "Empty page name!\n";
		return;
	}
	
	my $response=$browser->get("$main::TARGET_URL/index.php?title=$name_of_page&action=edit");
	my $output1 = $response->as_string;

	if ($output1 =~ /badtitle/i) {
		# Print the title as a list of character codes to expose odd characters.
		$name_of_page =~ s/(.)/sprintf ("%d ", ord($1))/ge;
		print "Bad title '$name_of_page'\n\n"; 
		return;
	}

	if ($output1 =~ /mw-recreate-deleted-warn/ && !$main::REWRITE_EXISTING_PAGES) {
		print ":  deleted - skipping:\n\n";
		print EXISTING_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
		return;
	}

	# Hidden fields of the edit form.  List assignment is used so that a
	# failed match yields undef instead of a stale $1 from an earlier match.
	my ($wpStarttime)   = $output1 =~ /value="(\d+)" name="wpStarttime"/;
	my ($wpEdittime)    = $output1 =~ /value="(\d+)" name="wpEdittime"/;
	my ($wpEditToken)   = $output1 =~ /value="(.+?)" name="wpEditToken"/;
	my ($wpAutoSummary) = $output1 =~ /name="wpAutoSummary" type="hidden" value="(.+?)"/;

	# Current page content (still HTML-escaped), possibly spanning many lines.
	my ($wpOldContent)  = $output1 =~ /<textarea[^<>]*>(.*?)<\/textarea>/s;

	for my $field ($wpStarttime, $wpEdittime, $wpEditToken, $wpAutoSummary, $wpOldContent) {
		$field = '' unless defined $field;
	}

	if ($wpTextbox1 =~ /%%%%%/) {      # edit
		$wpOldContent = htmlspecialchars_decode($wpOldContent);

		if ($in_file =~ /milon/ && $name_of_page =~ /קטגוריה:([א-ת]+) \(שורש\)/) {
		# Move the content from the category pages to the root pages (Wiktionary)
			my $jorj = $1;
			my $ot0 = substr($jorj,0,2); 
			#print "jorj=$jorj; ot=$ot0; \n"; die;
			if ($wpOldContent=~/נטיות פעלים/) {
				my $newContent = $wpOldContent;
				$newContent =~ s/קטגוריה:$ot0 \(שורשים\)/קטגוריה:$jorj (שורש)|*/;
				upload_file("$jorj (שורש)", $newContent, "העברה מדף הקטגוריה של השורש");

				$wpTextbox1 = "{{:$jorj (שורש)}}\n<noinclude>[[קטגוריה:$ot0 (שורשים)]]</noinclude>\n";
				$edit_summary = "העברת התוכן לדף השורש";
			} else {
				print "$name_of_page already changed - skipping\n";
				return;
			}
		} elsif ($name_of_page =~ /רבי יונה על/) {
		# Rename the pages of this commentary: copy the content to the new
		# title and leave a redirect behind.
			(my $new_name_of_page = $name_of_page) =~ s/רבי יונה/רבנו יונה/g;
			if ($wpOldContent!~/הפניה/) {
				my $newContent = $wpOldContent;
				$newContent =~ s/רבי יונה/רבנו יונה/g;
				upload_file($new_name_of_page, $newContent, "העברה מהדף [[$name_of_page]]");

				$wpTextbox1 = "#הפניה [[$new_name_of_page]]\n";
				$edit_summary = "העברה לדף [[$new_name_of_page]]";
			} else {
				print "$name_of_page already changed - skipping\n";
				return;
			}
		} else {
			$wpTextbox1 = edit_function($wpOldContent);
	
			#print "Current content: " . substr($wpOldContent,0,1000) . "...\n";
			# print "New content: " . substr($wpTextbox1,0,1000) . "...\n";
	
			# Sanity check
			if (!$wpTextbox1) {
				die "New content is empty - probably a bug in your edit_function!";
			}
		}
	} elsif ($wpTextbox1 =~ /[+][+][+][+][+]\s+(.*)/s) {     # add
		my $contentToAdd = $1;
		if ($contentToAdd =~ /קיצור (.*)/) {
			# The added text declares a shortcut: add the shortcut template
			# and create the redirect page for the shortcut.
			$contentToAdd = "<noinclude>{{קיצור דרך|$1}}</noinclude>";
			my $name_of_shortcut = $1;
			print " $name_of_shortcut";
			upload_file($name_of_shortcut, 
				"#REDIRECT [[$name_of_page]]"
				);
			print DEBUG "uploaded shortcut $name_of_page\n";
			$count_pages++;
		}
		$wpTextbox1 = htmlspecialchars_decode("{{דרוש שילוב}}\n----\n$contentToAdd\n----\n$wpOldContent");
	} else {     # upload
		my $are_both_redirects =
			($wpOldContent =~/#הפניה/ || $wpOldContent =~/#redirect/i) && 
			($wpTextbox1 =~/#הפניה/ || $wpTextbox1 =~/#redirect/i);
		if ($wpOldContent && $wpOldContent=~/./ && !$main::REWRITE_EXISTING_PAGES && !$are_both_redirects) {
			print ":  already contains data - skipping:\n\n";# \n$wpOldContent\n";
			print EXISTING_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
			return;
		} else {  # usual upload - do some conversions before uploading:
			# convert indirect links to wikisource to direct links
			if ($main::TARGET_URL =~ /wikisource/) {
				$wpTextbox1 =~ s{\[http://he.wikisource.org/wiki/([^ \]]+) ([^\]]+)]}{[[$1|$2]]}ig;
			}
		}
	}

# the upload itself
	$response=$browser->post("$main::TARGET_URL/index.php?title=$name_of_page&action=submit",
	@ns_headers,Content_Type=>'form-data',Content=>
	[
			wpStarttime=>$wpStarttime,
			wpEdittime=>$wpEdittime,
			wpTextbox1=>"$wpTextbox1",
			wpSummary=>$edit_summary,
			wpEditToken=>$wpEditToken,
			wpAutoSummary=>$wpAutoSummary,
			wpWatchthis=>'on'
	]);
	push @responses,$response->as_string;
	if($response->code!=302 && $response->code!=200) {
		print " Upload failed!\n\n"; 

		print FAILED_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";

		print DEBUG "$name_of_page failed!\n";
	} elsif ($response->content =~ /badtitle/i) {
		print " Bad title!\n\n"; 
	} elsif ($response->content =~ /ספאם/i) {
		# The response page mentions spam: record the page as failed.
		print " Spam!\n\n";

		print FAILED_FILE "##### $name_of_page\n$wpTextbox1\nENDOFFILE\n";
		
		print DEBUG "$name_of_page failed - spam!\n";
	} else {
		#print $response->content;
		print " $name_of_page Uploaded successfully.\n";
	}
}

# create a navigation bar
# sargel_niwut($book, $part, $chapter)
#
# Returns the wikitext of a navigation-bar template call with six
# "|"-separated fields: template name, book, part, previous, current, next.
#
# With a true $chapter the bar navigates between chapters of $part;
# otherwise the part field is left empty and the bar navigates between parts.
# Numbers are converted with Hebrew::hebrew2number / Hebrew::number2hebrew.
sub sargel_niwut {
	my ($book, $part, $chapter) = @_;
	my @fields;

	if (!$chapter) {
		# Part-level bar: the first part is preceded by the introduction,
		# and the "next" link is the first chapter of this part.
		my $previous_part;
		if ($part eq 'א') {
			$previous_part = "הקדמה";
		} else {
			$previous_part = Hebrew::number2hebrew(Hebrew::hebrew2number($part)-1);
		}
		my $next_part = "$part א";
		@fields = ($book, "", $previous_part, $part, $next_part);
	} else {
		# Chapter-level bar: the first chapter has no predecessor.
		my $previous_chapter = "";
		if ($chapter ne 'א') {
			$previous_chapter = Hebrew::number2hebrew(Hebrew::hebrew2number($chapter)-1);
		}
		my $next_chapter = Hebrew::number2hebrew(Hebrew::hebrew2number($chapter)+1);
		@fields = ($book, $part, $previous_chapter, $chapter, $next_chapter);
	}

	return "{{" . join("|", "סרגל ניווט", @fields) . "}}";
}

__END__