# vim: set fileencoding=utf-8 :
########################################################################
#
# PasteHtml.py
# ------------
# HTML parser for Wikidpad
#
# v0.3 by xkjq@ymail.com
#
# As usual thanks to Michael for all his work on WikidPad and sorting
# out a cross platform way to retrieve html text from the clipboard.
#
# A plugin that retrieves HTML text from the clipboard and converts it
# to a wikidpad compatible format.
#
# The script is customisable so it shouldn't be too hard to modify it
# for custom parsers. Beware it's still early days and stuff is likely
# to change in future versions.
#
# Note: Formatting can only be retrieved if it is defined by html tags,
#       CSS formatting will be ignored.
#
# TODO: Clean up the code and maybe even give the script a nice GUI
#       for configuration
#
########################################################################
#
# The following settings define how the text is formatted. If the
# defaults are not to your liking they can be changed here.
#
# If you use a custom parser chances are you *will* need to change this.
#
#
# START CONFIGURATION
#
# Some html tags have no equivalent in the default wikiParser language.
# If (like me) you haven't got round to adding them to your parser, but
# still want to keep the formatting add them below.
tags_to_keep = set(["u", "sup", "sub"])
#
# tags_to_sub defines which html tags should be converted into wikiparser
# format and what their replacements should be
#
#   html_tag : ("wikiParser_start_tag", "wikidParser_end_tag")
#
# i.e. u"b" : (u"*", u"*")
#      would cause <b>Hello World!</b>
#      to become *Hello World!*
#
tags_to_sub = {
    u"b" : (u"*", u"*"),
    u"strong" : (u"*", u"*"),
    u"i" : (u"_", u"_"),
    u"em" : (u"_", u"_"),
    u"p" : (u"\n", u"\n"),
    u"h1" : (u"\n++", u"\n"),
    u"h2" : (u"\n+++", u"\n"),
    u"h3" : (u"\n++++", u"\n"),
    u"h4" : (u"\n+++++", u"\n"),
    u"h5" : (u"\n++++++", u"\n"),
    u"h6" : (u"\n+++++++", u"\n"),
    u"h7" : (u"\n++++++++", u"\n"),
    u"h8" : (u"\n+++++++++", u"\n"),
    u"br" : (u"\n", u""),
    # Links
    u"a" : (u"[", u"]"),
    # TODO: split into class tags
    # for OHCExam
    u"HD" : (u"", u"\n"),
    # Tags that don't exist in HTML
    # NOTE(review): this value is a plain string, not a (start, end)
    # tuple like every other entry; it looks damaged -- restore the
    # intended pair from the original plugin.
    u"box" : (u"<>\n"),
}
#
# This defines how tables should be formatted. If we are in a table
# the script will check for tags in here prior to those defined
# above.
format_table = {
    u"table": (u"\n<<|\n", u">>\n"),
    u"tr": (u"", u"\n"),
    u"td": (u"|", u""),
    # th is used in the header row. Currently wikipad, default parser
    # at least, does not distinguish the table header.
    u"th": (u"|", u""),
    u"caption": (u"", u""),
    # Previously defined tags can be overridden whilst
    # inside tables. i.e. br should become "\newline"
    # instead of just "newline" for the default parser
    u"br" : (u"\\\n", u""),
    u"p" : (u"", u""),
}
# End tags listed here have trailing whitespace stripped from the text
# that precedes them; text directly after their start tag is
# left-stripped.
STRIP_CONTENT_WHITESPACE_FROM_TAGS = (u"li", u"p", u"a", u"caption", u"td",
                                      u"tr", u"h1", u"h2", u"h3", u"h4",
                                      u"h5", u"table")
ADVANCE_TABLE_FORMATING = True
#
# Setting variable below to True would cause table formatting to be
# ignored if no table start tag has been encountered.
ignore_table_formatting_when_not_in_table = False
# If table not open will open table on any tag present in
# format_table (and not defined below)
open_tables_if_needed = True  # will override
                              # ignore_table_formatting_when_not_in_table
# Only these tags can start a table automatically
table_start_tags = set([u"tr", u"td", u"th", u"caption"])
close_open_tables = True  # Useful when only pasting part of a table
# Defines what formatting from format_table is used when opening a
# table. Probably best to leave this as default
table_start = u"table"
#
# If true tags which have no associated text and are not specified
# below will be ignored. Can solve some issues but may create others.
# i.e. <b></b> will be ignored
#
# NOTE(review): the settings that followed this comment are missing from
# this copy of the file (only the fragment `#>")` survived). The code
# below still reads these names, which must be restored from the
# original plugin:
#   ignore_empty_tags, allowed_empty_tags, spacer, lists_format,
#   IGNORE_P_IN_LISTS, KEEP_ANCHOR_ANCHORS, maintain_links,
#   use_wikidpad_anchor_format, wikipad_link_delimiter,
#   allow_extra_ol_formats, add_image_src, maintain_image_resizes,
#   wikipad_url_appendix_delimiter, pre_blocks_maintain_formating,
#   pre_block_tags, custom_replace_regex
# When True, getData() runs its regex clean-up for <i>, <b> and <u> tags
# before the HTML is parsed.
attempt_to_clean_formatting_tags = True
# Same clean-up, for formatting tags separated from text by a line break.
attempt_to_clean_formatting_tags_across_linebreaks = True
# When True, newlines inside paragraph text are replaced by spaces
# (see HTMLStripper.handle_data).
REMOVE_LINE_BREAKS_FROM_PARAGRAPHS = False
# When True, a <div> whose class is listed in BOX_CLASSES is converted
# using the u"box" entry of tags_to_sub.
ATTEMPT_TO_KEEP_DIV_BOXES = True
BOX_CLASSES = (u"SIDEBAR BOX", )
#
# Custom replaces
# ---------------
# These are custom rules to be run on the text. Can be handy to remove
# unwanted formatting (from sites such as wikipedia etc.) automatically.
#
# custom_replace is a standard python replace(a, b) function where all
# occurrences of a are replaced by b.
# custom_replace_regex is a re.sub.
#
# NOTE: These are run after the html has been parsed
enable_custom_replace = True
# NOTE(review): getData() applies this list with str.replace(), yet the
# active entries below are regular expressions, and custom_replace_regex
# is not defined anywhere in the visible file. Presumably the end of this
# list and the start of custom_replace_regex were lost -- confirm against
# the original plugin.
custom_replace = [
#("[edit]", ""), # Remove some unwanted wikipedia formatting
#("[_citation needed_]", ""),
# Hack, should be fixed in parser
("<(\[\d*\])?", r""), # Remove wikipedia refs, e.g. [2]
(r"\+\*_(.*)_\*", r"+\1"), # Drop bold+italic markers from headings
(r"\+_\*(.*)\*_", r"+\1"), # Drop italic+bold markers from headings
(r"\+\*(.*)\*", r"+\1"), # Drop bold markers from headings
(r"(\n *)*\n *\n", r"\n\n"), # Remove excessive line breaks
]
#
#
# END SCRIPT CONFIGURATION
# You shouldn't (!) have to edit anything below here
# (entity, character) pairs that getData() substitutes in the raw HTML
# before parsing. The left-hand side must be the named entity, otherwise
# each replace is a no-op. u"&amp;" is deliberately last so that text
# such as "&amp;lt;" decodes to the literal "&lt;" and is not decoded a
# second time.
html_character_entities = [
    (u"&lt;", u"<"),
    (u"&gt;", u">"),
    (u"&cent;", u"¢"),
    (u"&pound;", u"£"),
    (u"&yen;", u"¥"),
    (u"&euro;", u"€"),
    (u"&sect;", u"§"),
    (u"&copy;", u"©"),
    (u"&reg;", u"®"),
    (u"&trade;", u"™"),
    (u"&amp;", u"&"),
]
########################################################################
import sys, wx, re
from HTMLParser import HTMLParser
try:
# If runnig from source we need to import as such
from lib.pwiki.wxHelper import getHtmlFromClipboard, getTextFromClipboard
except ImportError:
# This should handle the windows binary case
from pwiki.wxHelper import getHtmlFromClipboard, getTextFromClipboard
WIKIDPAD_PLUGIN = (("MenuFunctions",1),)


def describeMenuItems(wiki):
    """
    Describe the menu entries this plugin adds.

    wiki -- passed in by the caller, not used here.

    Returns a tuple holding one (callback, menu label with shortcut,
    help text) entry for the Paste command.
    """
    # The former "global nextnumber" declaration was dead code: the name
    # is never read or assigned.
    return ((Paste, "Paste HTML\tCtrl-Shift-V", "Pastes HTML in a wikipad compatable format"),)
# Custom lists conversion
# Stored as a tuple: a bare zip() is a one-shot iterator on Python 3 and
# would be exhausted after the first intToRoman() call.
numeral_map = tuple(zip(
    (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1),
    ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
))


def intToRoman(i):
    """
    Converts integer to roman numeral

    i -- the integer to convert. Values below 1 yield an empty string.

    Returns the upper-case roman numeral as a string.
    """
    result = []
    for integer, numeral in numeral_map:
        if i <= 0:
            break
        # divmod keeps the arithmetic in integers (no float division)
        count, i = divmod(i, integer)
        result.append(numeral * count)
    return ''.join(result)
# Index 0 is a placeholder so that letters[1] == "A".
letters = "0ABCDEFGHIJKLMNOPQRSTUVWXYZ"


def intToLetter(i):
    """
    Converts integer to letter format

    1 -> A ... 26 -> Z, 27 -> AA ... 52 -> AZ, 53 -> BA ... 702 -> ZZ,
    703 -> AAA and so on (bijective base 26, the scheme spreadsheet
    columns use).

    i -- the integer to convert. Values below 1 return letters[0] ("0").
    """
    if i < 1:
        return letters[0]
    chars = []
    while i > 0:
        # Shift to zero-based before dividing so that multiples of 26
        # end in "Z" instead of rolling over to the placeholder.
        i, remainder = divmod(i - 1, 26)
        chars.append(letters[remainder + 1])
    return "".join(reversed(chars))
class HTMLStripper(HTMLParser):
    """
    Converts HTML to wikidpad markup.

    Strips all HTML tags other than those defined in tags_to_keep.
    Tags found in tags_to_sub, format_table or lists_format are replaced
    by their configured wiki equivalents. Output fragments are collected
    in the list self.fed and joined by get_data().

    NOTE(review): the statement nesting of this class was reconstructed
    from an unindented copy of the file; places where more than one
    nesting was possible are marked below.
    """

    def __init__(self):
        # Run the base initialiser (it calls reset() itself); calling
        # only reset() skips base-class setup on Python 3.
        HTMLParser.__init__(self)
        self.fed = []
        # Backup of self.fed while a caption is collected, else None
        self.fed_bak = None
        self.spacer = spacer
        self.tags_to_keep = tags_to_keep
        self.tags_to_sub = tags_to_sub
        self.lists_format = lists_format
        self.list_structure = []
        self.list_numbers = []
        # Is it realistic to handle tables within tables?
        self.in_table = 0
        self.table_start_pos = 0 # position in the self.fed list
        self.in_paragraph = False
        self.span_tags = []
        self.div_tags = []
        self.new_table_row = True
        self.table_cell_appendix = None
        self.last_tag = None
        # Read before they are first assigned, so they need defaults
        self.previous_tag = None
        self.last_end_tag = None
        self.link_anchor = None

    def _style_tags(self, attrs):
        """
        Return the tags (u"i" and/or u"b") implied by an inline style
        attribute: font-style italic, font-weight bold or above 400.
        """
        found = []
        style_attr = self.get_attribute(attrs, "style", False)
        if not style_attr:
            return found
        styles = {}
        for declaration in style_attr.split(u";"):
            try:
                name, value = declaration.split(u":")
            except ValueError:
                # Not a simple "name: value" pair, skip it
                continue
            styles[name.strip()] = value.strip()
        if styles.get(u"font-style") == u"italic":
            found.append(u"i")
        weight = styles.get(u"font-weight")
        if weight == u"bold":
            found.append(u"b")
        elif weight is not None:
            try:
                if int(weight) > 400:
                    found.append(u"b")
            except ValueError:
                pass
        return found

    def _close_wrapped_tags(self, stack):
        """Close, in reverse order, the tags opened for a span/div."""
        if not stack:
            return  # unbalanced closing tag, nothing was opened
        for inner in reversed(stack.pop()):
            self.handle_endtag(inner)

    def handle_starttag(self, tag, attrs):
        """
        Called whenever an opening tag is reached.
        Generally we add the wikipad equivalent to the
        list self.fed

        tag   -- tag name
        attrs -- list of (name, value) tuples; None when called
                 internally for tags synthesised from a span/div
        """
        self.previous_tag = self.last_tag
        self.last_tag = tag

        if tag == u"p":
            if IGNORE_P_IN_LISTS and len(self.list_structure) > 0:
                return
            self.in_paragraph = True

        if tag == u"img":
            self.handle_image(attrs)
            return

        # If we're in a table check table formatting first
        if tag in format_table:
            if tag == table_start:
                self.in_table += 1
                self.table_start_pos = len(self.fed)
            if tag == u"tr":
                self.new_table_row = True
            # NOTE(review): the final "else" is taken to pair with the
            # in_table test; with it on the inner "if", tags inside a
            # table would never be formatted.
            if self.in_table < 1: # Table not started
                if open_tables_if_needed and tag in \
                        table_start_tags: # Start table
                    self.in_table += 1
                    self.table_start_pos = len(self.fed)
                    self.fed.append(format_table[table_start][0])
                    self.handle_table_tag(tag, attrs)
                    return
                elif ignore_table_formatting_when_not_in_table:
                    pass
            else:
                self.handle_table_tag(tag, attrs)
                return

        # Extract some style from span's
        # TODO: SHOULD GENERALISE
        if tag == u"span":
            span_tags_to_add = self._style_tags(attrs)
            for i in span_tags_to_add:
                self.handle_starttag(i, None)
            # Remembered so the matching end tag can close them
            self.span_tags.append(span_tags_to_add)
            return

        if tag == u"div":
            div_tags_to_add = []
            # ----------------------------------------------
            # Code below is for importing from OHCExam
            css_class = self.get_attribute(attrs, "class", False)
            if css_class and css_class.startswith("TLV"):
                try:
                    level = int(css_class[3])
                except (IndexError, ValueError):
                    level = 0  # malformed class name: plain heading
                self.fed.append(u"\n\n++{0}".format(level * u"+"))
            elif css_class == u"HD":
                div_tags_to_add.append(u"HD")
            elif css_class == u"P":
                # NOTE(review): only the u"HD" append is taken to be
                # guarded by the previous-tag test -- confirm.
                if self.previous_tag != u"li":
                    div_tags_to_add.append(u"HD")
                div_tags_to_add.append(u"p")
            elif ATTEMPT_TO_KEEP_DIV_BOXES and css_class in BOX_CLASSES:
                div_tags_to_add.append(u"box")
            # ----------------------------------------------
            div_tags_to_add.extend(self._style_tags(attrs))
            for i in div_tags_to_add:
                self.handle_starttag(i, None)
            self.div_tags.append(div_tags_to_add)
            return

        if tag in self.tags_to_keep:
            self.fed.append(u"<{0}>".format(tag))
            return

        if tag in self.tags_to_sub:
            # Parse anchors start tag
            if tag == u"a":
                if KEEP_ANCHOR_ANCHORS and self.get_attribute(attrs, "name", False):
                    self.fed.append("anchor: {0}\n".format(self.get_attribute(attrs, "name", False)))
                if maintain_links:
                    link = self.get_attribute(attrs, u"href", u"Unable to find link")
                    if use_wikidpad_anchor_format and u"#" in link:
                        # Split on the first "#" only; a second one
                        # used to break the two-way unpacking
                        link, anchor = link.split(u"#", 1)
                        self.link_anchor = "!{0}".format(anchor)
                    self.fed.append("".join([self.tags_to_sub[tag][0], link, wikipad_link_delimiter]))
                else:
                    # Add blank object so it can be removed if necessary
                    self.fed.append(u"")
                return
            self.fed.append(self.tags_to_sub[tag][0])

        # Handle lists
        if tag in self.lists_format:
            list_type = self.get_attribute(attrs, u"type")
            self.list_structure.append((self.lists_format[tag], list_type))
            self.list_numbers.append(0)

        if tag == u"li" and len(self.list_structure) > 0:
            list_item_number = self.list_numbers[-1] + 1
            self.list_numbers[-1] = list_item_number
            list_start, list_type = self.list_structure[-1]
            if list_start == u"ordered": # Ordered list
                n = list_item_number
                if allow_extra_ol_formats:
                    if list_type == u"A": # Capital letters
                        n = intToLetter(list_item_number)
                    elif list_type == u"a": # Lower letters
                        n = intToLetter(list_item_number).lower()
                    elif list_type == u"I": # Roman letters
                        n = intToRoman(list_item_number)
                    elif list_type == u"i": # Roman lower letters
                        n = intToRoman(list_item_number).lower()
                list_start = u"{0}.".format(n)
            # List items must start on a new line (in wikidpad)
            if self.fed and not (self.fed[-1].endswith(u"\n") or
                                 self.fed[-1].strip() == u""):
                self.fed.append(u"\n")
            self.fed.append(u"{0}{1} ".format(self.spacer*len(self.list_structure), list_start))

    def handle_startendtag(self, tag, attrs):
        """
        Self closing tags are handled here, e.g. <br />

        For now just redirects to handle_starttag()
        Are there any situations in which this shouldn't
        happen?
        """
        self.handle_starttag(tag, attrs)

    def handle_endtag(self, tag):
        """
        Called as the tag is closed
        """
        # Strip whitespace from the previous content if required
        if tag in STRIP_CONTENT_WHITESPACE_FROM_TAGS and self.fed:
            self.fed[-1] = self.fed[-1].rstrip()
        self.last_end_tag = tag

        if tag == u"p":
            if IGNORE_P_IN_LISTS and len(self.list_structure) > 0:
                return
            self.in_paragraph = False

        if ignore_empty_tags:
            if tag in self.tags_to_keep or tag in self.tags_to_sub:
                # last_tag still equal to this tag means no data was
                # seen since it opened: drop its start marker
                if tag == self.last_tag and tag not in allowed_empty_tags:
                    if self.fed:
                        del self.fed[-1]
                    self.last_tag = None
                    return

        if tag == u"span":
            self._close_wrapped_tags(self.span_tags)
        elif tag == u"div":
            self._close_wrapped_tags(self.div_tags)

        if tag in format_table:
            if self.in_table > 0:
                # Only move a caption when handle_table_tag() actually
                # started a temporary feed for it
                if tag == u"caption" and self.fed_bak is not None:
                    d = self.end_temp_feed()
                    self.add_table_arg(self.table_start_pos, u"C")
                    self.fed.insert(self.table_start_pos+1, d+"\n")
                if tag == table_start:
                    self.in_table -= 1
                self.fed.append(format_table[tag][1])
                if self.table_cell_appendix is not None:
                    self.fed.append(self.table_cell_appendix)
                    self.table_cell_appendix = None
                return

        if tag == u"a":
            if maintain_links:
                self.fed.append(self.tags_to_sub[tag][1])
                if self.link_anchor is not None:
                    self.fed.append(self.link_anchor)
                    self.link_anchor = None
            return

        if tag in self.tags_to_keep:
            # Proper closing tag, matching the u"<{0}>" of the start tag
            self.fed.append(u"</{0}>".format(tag))
            return

        if tag in self.tags_to_sub:
            self.fed.append(self.tags_to_sub[tag][1])

        if tag in self.lists_format:
            if self.list_structure:
                del self.list_structure[-1]
                # Drop the item counter too, otherwise the enclosing
                # list carries on from the nested list's count
                if self.list_numbers:
                    del self.list_numbers[-1]
            if not self.list_structure:
                self.fed.append(u"\n")

    def add_table_arg(self, table_start_pos, arg):
        """
        Append arg to the table start marker at self.fed[table_start_pos].

        Further arguments are separated by ";". Returns False when that
        entry is not a table start marker, True otherwise.
        """
        if table_start_pos >= len(self.fed):
            return False
        start_block = self.fed[table_start_pos]
        opener = format_table["table"][0]
        # Compare without the trailing newline, so a marker that already
        # carries an argument is still recognised
        stem = opener[:-1] if opener.endswith("\n") else opener
        if not start_block.startswith(stem):
            return False
        to_add = u""
        if start_block.endswith("\n"):
            start_block = start_block[:-1]
            to_add = u"\n"
        separator = u"" if start_block == stem else u";"
        self.fed[table_start_pos] = start_block + separator + arg + to_add
        return True

    def handle_data(self, d):
        """Called with the text between tags; appends it to self.fed."""
        # Collapse consecutive single-space fragments
        if d == u" " and self.fed and self.fed[-1] == u" ":
            return

        # Ignore empty lines within list
        if self.list_structure and not d.strip():
            return

        if self.in_paragraph and REMOVE_LINE_BREAKS_FROM_PARAGRAPHS:
            d = d.replace(u"\n", u" ")

        # Drop the content of style tags
        if self.last_tag == u"style":
            self.last_tag = None
            return

        if self.last_tag in STRIP_CONTENT_WHITESPACE_FROM_TAGS:
            d = d.lstrip()
        # Data was seen, so the open tag is no longer "empty"
        self.last_tag = None
        self.fed.append(d)

    def get_data(self):
        """Return the converted text, closing any tables left open."""
        if close_open_tables:
            while self.in_table > 0:
                self.fed.append(format_table[table_start][1])
                self.in_table -= 1
        return u''.join(self.fed)

    def start_temp_feed(self):
        """Set self.fed aside and collect output in a fresh list."""
        self.fed_bak = self.fed[:]
        self.fed = []

    def end_temp_feed(self):
        """
        Restore the feed saved by start_temp_feed() and return the text
        collected meanwhile (empty if no temporary feed was active).
        """
        if self.fed_bak is None:
            return u""
        data = self.fed[:]
        self.fed = self.fed_bak[:]
        self.fed_bak = None
        return "".join(data)

    def handle_table_tag(self, tag, attrs=None):
        """Emit the format_table start markup for a table-related tag."""
        # Captions need to be inserted before the table start
        # NOTE: not all captions are defined in the correct position so
        # the caption text is collected separately and moved on its end
        # tag
        if ADVANCE_TABLE_FORMATING and tag == u"caption":
            self.start_temp_feed()

        # Special case for p
        if tag == u"p":
            if self.last_end_tag == u"p":
                self.fed.append(u"\\\n")

        # Special case is needed for td
        if tag == u"td" or tag == u"th":
            if ADVANCE_TABLE_FORMATING:
                # Get attributes that will be placed into table appendix
                colspan = self.get_attribute(attrs, "colspan", False)
                rowspan = self.get_attribute(attrs, "rowspan", False)
                # Could also pull some style info here, e.g. text-align
                appendix = []
                if colspan:
                    appendix.append('c{0}'.format(colspan))
                if rowspan:
                    appendix.append('r{0}'.format(rowspan))
                if appendix:
                    self.table_cell_appendix = ";{0}".format(";".join(appendix))
            if not self.new_table_row:
                # need spaces or blank cell will be ignored
                self.fed.append(u" {0} ".format(format_table[tag][0]))
            else:
                # The first cell of a row gets no separator
                self.new_table_row = False
            return
        else:
            self.fed.append(format_table[tag][0])

    def handle_image(self, attrs):
        """Append the image source (plus optional size appendix)."""
        if not add_image_src:
            return
        appendix = ""
        if maintain_image_resizes:
            width = self.get_attribute(attrs, "width", False)
            height = self.get_attribute(attrs, "height", False)
            # Currently only deals with img for which both width and
            # height are specified
            if width and height:
                # HTML can handle images with 1 dimension in %
                # and the other in pixel but wikidpad (default
                # parser at least) cannot.
                size_type = "r" if width[-1] == "%" else "s"
                appendix = "".join([wikipad_url_appendix_delimiter, size_type,
                                    width, "x", height, " "])
        self.fed.append("".join([self.get_attribute(attrs, "src"), appendix]))

    def get_attribute(self, attrs, attr, not_found=""):
        """
        Loops through all attributes returning the requested one
        if found.

        attrs is a list of tuples, quotations (") are not included
            [(name, value), (name2, value2), ...]
        It may also be None or empty, in which case not_found is
        returned.
        """
        if attrs:
            for name, value in attrs:
                if name == attr:
                    return value
        return not_found
def strip_tags(html):
    """Feed html through a fresh HTMLStripper and return its output."""
    stripper = HTMLStripper()
    stripper.feed(html)
    converted = stripper.get_data()
    return converted
def getData(d):
    """
    Convert a string of HTML into wikidpad markup.

    Steps: replace character entities, set <pre> blocks aside, collapse
    whitespace, tidy formatting tags, strip/convert tags with
    strip_tags(), put the <pre> blocks back and run the custom replaces.

    d -- the HTML text

    Returns the converted text with leading whitespace removed.
    """
    # Remove html character entities
    for entity, char in html_character_entities:
        d = d.replace(entity, char)

    pre_blocks = []
    if pre_blocks_maintain_formating:
        # Pre blocks have to be handled before we remove whitespace.
        # NOTE(review): pattern rebuilt from a damaged copy in which the
        # "<pre(?:>" / "</pre(?:>" prefixes were missing -- confirm
        # against the original plugin.
        pre_pattern = re.compile(
            r"<pre(?:>| [\s\S]*?>)([\s\S]*?)</pre(?:>| [\s\S]*?>)")
        match = pre_pattern.search(d)
        while match:
            # Swap the block for a placeholder that survives parsing
            d = d.replace(match.group(0), "$_PREBLOCK-{0}_$".format(len(pre_blocks)))
            pre_blocks.append(match.group(1))
            match = pre_pattern.search(d)

    # Remove whitespace (ignored in html anyway)
    d = " ".join(d.split())

    # NOTE(review): the four patterns below are kept exactly as found;
    # they may also have been damaged -- verify.
    if attempt_to_clean_formatting_tags:
        for i in ("i", "b", "u"):
            d = re.sub("[(?<=\w)\n](<{0}>)\n. .[(?=\w)]".format(i), r"\1 ", d)
            d = re.sub("[(?<=\w)\n][\n ].(<\/{0}>)[(?=\w)\n]".format(i), r"\1 ", d)

    if attempt_to_clean_formatting_tags_across_linebreaks:
        for i in ("i", "b", "u"):
            d = re.sub("(?<=\w) .\n(<\/{0}>)(?=\w)|\n".format(i), r"\1\n", d)
            d = re.sub("(?<=\w)(<{0}>) .\n(?=\w)|\n".format(i), r"\1\n", d)

    d = strip_tags(d)

    # Clean up some spaces
    # NOTE(review): as written this replaces a space with a space (a
    # no-op); the first argument was probably a double space or a
    # non-breaking space -- confirm.
    d = d.replace(" ", " ")

    # Add pre blocks back in (the list is empty when the feature is off)
    for index, block in enumerate(pre_blocks):
        d = d.replace("$_PREBLOCK-{0}_$".format(index),
                      "{0}{1}{2}".format(pre_block_tags[0], block, pre_block_tags[1]))

    # Perform custom replaces (if enabled)
    if enable_custom_replace:
        for a, b in custom_replace:
            d = d.replace(a, b)
        for a, b in custom_replace_regex:
            d = re.sub(a, b, d)

    return d.lstrip()
def Paste(pwiki, evt):
    """
    Menu callback: replace the editor selection with the clipboard
    content.

    HTML on the clipboard is converted with getData(); when there is
    none the plain clipboard text is inserted instead.

    pwiki -- object providing getActiveEditor()
    evt   -- the menu event (unused)
    """
    editor = pwiki.getActiveEditor()
    clipboard_html = getHtmlFromClipboard()[0]
    if clipboard_html is None:
        replacement = getTextFromClipboard()
    else:
        replacement = getData(clipboard_html)
    editor.ReplaceSelection(replacement)