Package translate :: Package storage :: Module html
[hide private]
[frames | no frames]

Source Code for Module translate.storage.html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2004-2006,2008 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from htmlentitydefs import name2codepoint 
 27  import HTMLParser 
 28   
 29  from translate.storage import base 
 30  from translate.storage.base import ParseError 
 31   
 32  # Override the piclose tag from simple > to ?> otherwise we consume HTML 
 33  # within the processing instructions 
 34  HTMLParser.piclose = re.compile('\?>') 
 35   
 36   
 37  strip_html_re = re.compile(r''' 
 38  (?s)^       # We allow newlines, and match start of line 
 39  <(?P<tag>[^\s?>]+)  # Match start of tag and the first character (not ? or >) 
 40  (?: 
 41    (?: 
 42      [^>]    # Anything that's not a > is valid tag material 
 43        | 
 44      (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid 
 45    )*        # Repeat over valid tag material 
 46    [^?>]     # If we have > 1 char, the last char can't be ? or > 
 47  )?          # The repeated chars are optional, so that <a>, <p> work 
 48  >           # Match ending > of opening tag 
 49   
 50  (.*)        # Match actual contents of tag 
 51   
 52  </(?P=tag)>   # Match ending tag; can't end with ?> and must be >=1 char 
 53  $           # Match end of line 
 54  ''', re.VERBOSE) 
 55   
 56   
57 -def strip_html(text):
58 """Strip unnecessary html from the text. 59 60 HTML tags are deemed unnecessary if it fully encloses the translatable 61 text, eg. '<a href="index.html">Home Page</a>'. 62 63 HTML tags that occurs within the normal flow of text will not be removed, 64 eg. 'This is a link to the <a href="index.html">Home Page</a>.' 65 """ 66 text = text.strip() 67 68 # If all that is left is PHP, return "" 69 result = re.findall('(?s)^<\?.*?\?>$', text) 70 if len(result) == 1: 71 return "" 72 73 result = strip_html_re.findall(text) 74 if len(result) == 1: 75 text = strip_html(result[0][1]) 76 return text
# Two or more consecutive whitespace characters (spaces, tabs, newlines...).
normalize_re = re.compile(r"\s\s+")


def normalize_html(text):
    """Collapse runs of whitespace in HTML snippets.

    Every run of two or more whitespace characters is replaced by a single
    space.  A lone whitespace character (including a single newline or tab)
    is left untouched.

    @param text: the HTML snippet to normalize
    @return: the snippet with whitespace runs collapsed
    """
    return normalize_re.sub(" ", text)
85 86
def safe_escape(html):
    """Escape bare ampersands in an HTML snippet.

    An '&' is rewritten to '&amp;' unless it already starts something that
    looks like a named entity (letters/digits followed by ';').  '<' and '>'
    are left as they are.

    @param html: the HTML snippet to escape
    @return: the snippet with bare ampersands escaped
    """
    # FIXME we need to relook at these.  Escaping to cleanup htmlentity codes
    # is important but we can't mix "<code>&lt;".  In these cases we should
    # then abort the escaping
    bare_ampersand = "&(?![a-zA-Z0-9]+;)"
    escaped = re.sub(bare_ampersand, "&amp;", html)
    return escaped
93 94
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content"""

    def __init__(self, source=None):
        """Create a unit holding the given source text.

        @param source: the HTML snippet for this unit, or None for a unit
            without source text
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the source text with '&amp;' unescaped and every line
        break replaced by a space, or None if no source was set."""
        if self._text is None:
            return None
        #TODO: Rethink how clever we should try to be with html entities.
        text = self._text.replace("&amp;", "&")
        # "\r\n" must be handled first so that it becomes one space, not two
        for linebreak in ("\r\n", "\n", "\r"):
            text = text.replace(linebreak, " ")
        return text

    def setsource(self, source):
        """Store the source text, escaping bare ampersands.

        None is stored as is: the default constructor argument would
        otherwise make the escaping regex fail with a TypeError.
        """
        # NOTE(review): presumably resets a rich-source cache kept by the
        # base class - confirm against base.TranslationUnit
        self._rich_source = None
        if source is None:
            self._text = None
        else:
            self._text = safe_escape(source)
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Record a location string for this unit."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded for this unit."""
        return self.locations
118 119
class htmlfile(HTMLParser.HTMLParser, base.TranslationStore):
    """Translation store that extracts translatable units from HTML.

    The document is fed through HTMLParser.  Text found inside the tags
    listed in MARKINGTAGS and in the attributes listed in INCLUDEATTRS is
    added as units, while the document is rebuilt in self.filesrc with
    every extracted piece of text passed through self.callback.
    """
    UnitClass = htmlunit

    MARKINGTAGS = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th",
                   "td", "div", "li", "dt", "dd", "address", "caption", "pre"]
    """Text in these tags that will be extracted from the HTML document"""

    MARKINGATTRS = []
    """Text from tags with these attributes will be extracted from the HTML
    document"""

    INCLUDEATTRS = ["alt", "summary", "standby", "abbr", "content"]
    """Text from these attributes are extracted"""

    SELF_CLOSING_TAGS = [u"area", u"base", u"basefont", u"br", u"col",
                         u"frame", u"hr", u"img", u"input", u"link", u"meta",
                         u"param"]
    """HTML self-closing tags.  Tags that should be specified as <img /> but
    might be <img>.
    U{Reference<http://learnwebsitemaking.com/htmlselfclosingtags.html>}"""

    def __init__(self, includeuntaggeddata=None, inputfile=None,
                 callback=None):
        """Set up the parser state and parse inputfile if one is given.

        @param includeuntaggeddata: if true, text found outside of any
            marking tag is extracted as well
        @param inputfile: optional file-like object; it is read completely,
            closed, and its contents are parsed
        @param callback: optional function applied to every piece of text
            before it is written to self.filesrc; defaults to returning the
            text unchanged
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = u""
        self.currentcomment = u""
        self.currenttag = None
        self.currentpos = -1
        self.tag_path = []
        self.filesrc = u""
        self.currentsrc = u""
        self.pidict = {}
        if callback is None:
            self.callback = self._simple_callback
        else:
            self.callback = callback
        self.includeuntaggeddata = includeuntaggeddata
        HTMLParser.HTMLParser.__init__(self)

        if inputfile is not None:
            # Close the file even when reading it fails
            try:
                htmlsrc = inputfile.read()
            finally:
                inputfile.close()
            self.parse(htmlsrc)

    def _simple_callback(self, string):
        """Default callback: return the text unchanged."""
        return string

    ENCODING_RE = re.compile(r'''<meta.*
                                content.*=.*?charset.*?=\s*?
                                ([^\s]*)
                                \s*?["']\s*?>
                             ''', re.VERBOSE | re.IGNORECASE)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.

        @return: the charset name of the first match, or None if there is
            no such meta tag
        """
        declared = self.ENCODING_RE.findall(htmlsrc)
        if declared:
            return declared[0]
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text decoded based on its declared charset.

        utf-8 is used when no charset is declared or when the declared
        charset is not a codec known to Python.
        """
        charset = self.guess_encoding(htmlsrc)
        if charset:
            try:
                return htmlsrc.decode(charset)
            except LookupError:
                # Bogus charset name in the document: fall back to utf-8
                pass
        return htmlsrc.decode('utf-8')

    def pi_escape(self, text):
        """Replaces the contents of all processing instructions with
        placeholders, and returns the new text.  The current implementation
        replaces "<" and ">" inside <?foo?> with "%lt;" and "%gt;".  The
        placeholder => code conversions are stored in self.pidict for later
        use in restoring the real PHP.

        The purpose of this is to remove all potential "tag-like" code from
        inside PHP.  The following PHP::
          $a < $b ? $c : ($d > $e ? $f : $g)
        looks like it contains an HTML tag::
          < $b ? $c : ($d >
        to nearly any regex.  Hence, we replace the angle brackets inside
        PHP with simple strings to help our regexes out.
        """
        def _escape(match):
            # Rewrite only this processing instruction; text elsewhere in
            # the document that happens to equal its contents is untouched.
            pi = match.group(1)
            pi_escaped = pi.replace("<", "%lt;").replace(">", "%gt;")
            self.pidict[pi_escaped] = pi
            return "<?" + pi_escaped + "?>"

        return re.sub(r'(?s)<\?(.*?)\?>', _escape, text)

    def pi_unescape(self, text):
        """Replaces the PHP placeholders in text with the real code"""
        for pi_escaped, pi in self.pidict.items():
            text = text.replace(pi_escaped, pi)
        return text

    def parse(self, htmlsrc):
        """Decode the raw HTML, hide the PHP in it and feed the parser."""
        htmlsrc = self.do_encoding(htmlsrc)
        htmlsrc = self.pi_escape(htmlsrc)  # Clear out the PHP before parsing
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add text as a new unit if it has translatable content.

        The unit gets a location built from the file name, the current tag
        path and the line of the current block, and the pending comment as
        a note.
        """
        text = strip_html(text)
        text = self.pi_unescape(text)  # Before adding anything, restore PHP
        text = normalize_html(text)
        if self.has_translatable_content(text):
            # addsourceunit/addnote are inherited (not defined in this file)
            unit = self.addsourceunit(text)
            unit.addlocation("%s+%s:%d" %
                             (self.filename, ".".join(self.tag_path),
                              self.currentpos))
            unit.addnote(self.currentcomment)

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be
        translated."""

        text = text.strip()
        # Exactly one "charset...=" hit: treated as an encoding declaration
        if len(re.findall(r'(?i).*(charset.*=.*)', text)) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        without_php = re.sub(r'<\?.*?\?>', '', text).strip()  # Lazily strip all PHP
        without_tags = re.sub(r'<[^>]*>', '', without_php).strip()  # Strip all HTML tags
        return bool(without_tags)

    def buildtag(self, tag, attrs=None, startend=False):
        """Create an HTML tag

        @param tag: the tag name
        @param attrs: optional list of (name, value) pairs; a value of None
            renders the bare attribute name
        @param startend: if true, build a self-closing tag (<tag />)
        @return: the tag as a unicode string
        """
        selfclosing = u""
        if startend:
            selfclosing = u" /"
        if attrs:
            rendered = []
            for name, value in attrs:
                if value is None:
                    # Valueless attribute such as <option selected>
                    rendered.append('%s' % name)
                else:
                    rendered.append('%s="%s"' % (name, value))
            return u"<%(tag)s %(attrs)s%(selfclosing)s>" % \
                {"tag": tag,
                 "attrs": " ".join(rendered),
                 "selfclosing": selfclosing}
        return u"<%(tag)s%(selfclosing)s>" % {"tag": tag,
                                              "selfclosing": selfclosing}

    def _extract_attrs(self, attrs):
        """Extract translatable attribute values, replacing each of them in
        attrs (in place) by the callback's result.

        @return: True if any attribute name is listed in MARKINGATTRS
        """
        marking = False
        for i, (attrname, attrvalue) in enumerate(attrs):
            if attrname in self.MARKINGATTRS:
                marking = True
            # Valueless attributes (value None) carry no text to extract
            if attrname in self.INCLUDEATTRS and attrvalue is not None \
               and self.currentblock == "":
                self.addhtmlblock(attrvalue)
                attrs[i] = (attrname,
                            self.callback(normalize_html(attrvalue).replace("\n", " ")))
        return marking

    #From here on below, follows the methods of the HTMLParser

    def startblock(self, tag, attrs=None):
        """Flush the block collected so far and start a new one for tag."""
        self.addhtmlblock(self.currentblock)
        stripped = strip_html(self.currentsrc)
        if self.callback(normalize_html(stripped)):
            self.filesrc += self.currentsrc.replace(
                stripped,
                self.callback(normalize_html(stripped).replace("\n", " ")))
        else:
            self.filesrc += self.currentsrc
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = tag
        self.currentpos = self.getpos()[0]
        if tag is None:
            # Block of untagged data: there is no opening tag to emit
            self.currentsrc = u""
        else:
            self.currentsrc = self.buildtag(tag, attrs)

    def endblock(self):
        """Flush the block collected so far and leave block mode."""
        self.addhtmlblock(self.currentblock)
        stripped = strip_html(self.currentsrc)
        if self.callback(normalize_html(stripped)) is not None:
            self.filesrc += self.currentsrc.replace(
                stripped,
                self.callback(normalize_html(stripped.replace("\n", " "))))
        else:
            self.filesrc += self.currentsrc
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = None
        self.currentpos = -1
        self.currentsrc = ""

    def handle_starttag(self, tag, attrs):
        """Track the tag path and start a new block for marking tags."""
        # A self-closing tag left on the path has no end tag of its own
        if self.tag_path and self.tag_path[-1] in self.SELF_CLOSING_TAGS:
            self.tag_path.pop()
        self.tag_path.append(tag)
        newblock = tag in self.MARKINGTAGS
        if self._extract_attrs(attrs):
            newblock = True

        if newblock:
            self.startblock(tag, attrs)
        elif self.currenttag is not None:
            self.currentblock += self.get_starttag_text()
            self.currentsrc += self.get_starttag_text()
        else:
            self.filesrc += self.buildtag(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle a tag of the form <tag ... />."""
        self._extract_attrs(attrs)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()
            self.currentsrc += self.get_starttag_text()
        else:
            self.filesrc += self.buildtag(tag, attrs, startend=True)

    def handle_endtag(self, tag):
        """Close the current block or copy the end tag, then check that the
        tag matches the innermost open tag.

        @raise ParseError: if there is no open tag left or if the innermost
            open tag is a different one
        """
        if tag == self.currenttag:
            self.currentsrc += "</%(tag)s>" % {"tag": tag}
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag
            self.currentsrc += '</%s>' % tag
        else:
            self.filesrc += '</%s>' % tag
        try:
            popped = self.tag_path.pop()
            # Self-closing tags were never closed explicitly: skip them.
            # Running out of tags here is reported like any other mismatch.
            while popped in self.SELF_CLOSING_TAGS:
                popped = self.tag_path.pop()
        except IndexError:
            raise ParseError("Mismatched tags: no more tags: line %s" %
                             self.getpos()[0])
        if popped != tag:
            raise ParseError("Mismatched closing tag: "
                             "expected '%s' got '%s' at line %s" %
                             (popped, tag, self.getpos()[0]))

    def handle_data(self, data):
        """Append text to the current block, or copy it to the output."""
        if self.currenttag is not None:
            self.currentblock += data
            self.currentsrc += self.callback(data)
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data
            self.currentsrc += data
        else:
            self.filesrc += self.callback(data)

    def handle_charref(self, name):
        """Handle entries in the form &#NNNN; e.g. &#8417; as well as the
        hexadecimal form &#xHHHH; e.g. &#x20AC;"""
        if name[:1] in ("x", "X"):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.handle_data(unichr(codepoint))

    def handle_entityref(self, name):
        """Handle named entities of the form &aaaa; e.g. &rsquo;"""
        if name in ['gt', 'lt', 'amp']:
            # Keep markup-significant entities escaped
            self.handle_data("&%s;" % name)
        elif name in name2codepoint:
            self.handle_data(unichr(name2codepoint[name]))
        else:
            # Unknown entity: pass it through literally
            self.handle_data(u"&%s;" % name)

    def handle_comment(self, data):
        """Remember the comment for the next unit and copy it to the
        output."""
        # we can place comments above the msgid as translator comments!
        if self.currentcomment == "":
            self.currentcomment = data
        else:
            self.currentcomment += u'\n' + data
        self.filesrc += "<!--%s-->" % data

    def handle_pi(self, data):
        """Treat a processing instruction, with its PHP restored, as data."""
        self.handle_data("<?%s?>" % self.pi_unescape(data))
class POHTMLParser(htmlfile):
    """Subclass of htmlfile that adds or overrides nothing."""
399