"""
This library converts HTML files from the Ben Yehuda Project (https://benyehuda.org/)
into wiki code that can be used on any MediaWiki website, but especially on the Hebrew Wikisource.
See https://he.wikipedia.org/wiki/%D7%A2%D7%96%D7%A8%D7%94:%D7%AA%D7%97%D7%91%D7%99%D7%A8_%D7%95%D7%99%D7%A7%D7%99 for more information about wiki code.
"""
import re
import html
# Input: the Ben Yehuda Project HTML export that is going to be converted.
file_name = "C:\\Users\\efrmo\\Downloads\\" + "31020.html"
# Output: the text file that receives the generated wiki code.
OUTPUT_NAME = "PBYtoWiki_-_Output.txt"

# Read the whole document up front; the conversion steps below all operate on
# (and rebind) the module-level ``text`` string.
with open(file_name, encoding="utf-8") as html_file:
    text = html_file.read()
# footnotes
def replace_foonote(match_obj: re.Match) -> str:
    """Inline one footnote at every place it is referenced in ``text``.

    Meant to be called once per matched footnote definition.  Every
    reference anchor for that footnote number found in the module-level
    ``text`` is replaced by a ``{{הערה|...}}`` template that carries the
    footnote body.

    Args:
        match_obj: Match of a single footnote definition.  It must expose
            the named groups ``note_num`` (the footnote number) and
            ``foonote`` (the footnote body).

    Returns:
        The matched text, unchanged.  The useful work is the side effect on
        the global ``text``; returning a string keeps the function a valid
        ``re.sub`` replacement callback that leaves its match untouched.
    """
    global text
    num = match_obj.group("note_num")
    foonote = match_obj.group("foonote")
    # The in-body reference that points at this footnote.
    anchor = f'<a href="#fn:{num}" id="fnref:{num}" title="see footnote" class="footnote"><sup>{num}</sup></a>'
    # MediaWiki reads "name=value" inside a template argument as a named
    # parameter, so a body containing "=" is passed explicitly as "1=".
    prefix = "{{הערה|1=" if "=" in foonote else "{{הערה|"
    text = text.replace(anchor, prefix + foonote + "}}")
    return match_obj.group(0)
# Visit every footnote definition (the <li id="fn:N"> entries) and let
# replace_foonote() inline it at its reference points.  Only the callback's
# side effect on ``text`` is wanted, so the matches are iterated directly
# instead of building a substituted copy with re.sub() and throwing it away.
# The scan runs over the string ``text`` is bound to at this point; the
# definition entries themselves are left in place.
for footnote_match in re.finditer(
    r'<li id="fn:(?P<note_num>\d+)">\n<p>(?P<foonote>.*?) <a href="#fnref:(?P=note_num)" title="return to body" class="reversefootnote"> ↩</a></p>\n</li>',
    text,
    flags=re.DOTALL,
):
    replace_foonote(footnote_match)
# headers
def _heading_to_wiki(heading: re.Match) -> str:
    """Render a matched ``<hN id="...">title</hN>`` element as a wiki heading.

    The heading level N decides how many "=" signs wrap the title on each
    side, with a single space between the markers and the title.
    """
    marker = "=" * int(heading.group("level"))
    return f"{marker} {heading.group('header')} {marker}"


text = re.sub(
    r"<h(?P<level>[123456]) id=\".*?\">(?P<header>.*?)</h(?P=level)>",
    _heading_to_wiki,
    text,
)
# <strong>, <p>, <br>, etc.
# Bold runs become wiki bold ('''...'''); DOTALL lets a run span several lines.
text = re.compile(r"<strong>(.*?)</strong>", re.DOTALL).sub(r"'''\1'''", text)
# Paragraph and blockquote tags are simply dropped, keeping their content.
for bare_tag in (r"</?p>", r"</?blockquote>"):
    text = re.sub(bare_tag, "", text)
# Explicit line breaks turn into real newlines.
text = "\n".join(text.split("<br />"))
# tables
# Rewrite rules as (pattern, replacement, flags).  They are applied strictly in
# the order listed, each one on the result of the previous rule: structural
# wrappers are stripped first, then the table, its rows and its cells are
# turned into wiki table markup.
_TABLE_RULES = (
    (r"</?colgroup>\n?", "", 0),
    (r"<col />\n?", "", 0),
    (r"</?thead>", "", 0),
    (r"</?tbody>", "", 0),
    (r"\n?<table>\n?", r'\n{| class="wikitable"\n', 0),
    # Rows may span several lines, hence DOTALL; an optional class attribute
    # on <tr> is matched but not carried over.
    (r'\n?<tr( class=".*?")??>(.*?)</tr>\n?', r"\n|- \2\n", re.DOTALL),
    (r"\n?\t?<th>(.*?)</th>\n?", r"\n! \1", 0),
    (r"\n?\t?<td>(.*?)</td>\n?", r"\n| \1", 0),
    (r"\n?</table>\n?", r"\n|}\n", 0),
)
for _pattern, _replacement, _flags in _TABLE_RULES:
    text = re.sub(_pattern, _replacement, text, flags=_flags)
# special characters
# Decode HTML character references into the characters they stand for.
text = html.unescape(text)
# Normalise typographic double quotes to the regular ASCII quotation mark,
# all in a single pass over the text.
text = text.translate(str.maketrans({"“": '"', "„": '"', "”": '"'}))
# img
# Images are not converted: each <figure> block is swapped for an HTML comment
# marker (Hebrew, roughly "missing image that should be imported") followed,
# on the next line, by the image's src value so it can be handled by hand.
_FIGURE = re.compile(r'\n<figure>\n<img src="(.*?)" alt=".*?" />\n\n</figure>')
text = _FIGURE.sub(r"<!-- תמונה חסרה שיש ליבא -->\n\1", text)
# Save the converted wiki code; an existing output file is overwritten.
with open(OUTPUT_NAME, mode="w", encoding="utf-8") as wiki_file:
    wiki_file.write(text)