#! /usr/bin/env python3
"""
mw2html - Mediawiki to static HTML

I use this to create a personal website from a local mediawiki
installation.  No search functionality.  Hacks the Monobook skin and
the produced HTML.

Connelly Barnes 2005.  Public domain.
Reworked by Andre Pinto 2009.
 Improved performance.
 Improved filtering.
 Improved usability.
Customized for Audacity's manual wiki.
Minor tweaks (for Audacity) by James Crook, Nov 2009.
Moved to Python3 by Jack Thomson, May 2020
...
"""

__version__ = '0.1.0.3'

import re
import sys
import getopt
import random
import urllib.request, urllib.parse, urllib.error
import textwrap
import os, os.path
import errno
import hashlib
import http.client
from time import strftime
from shutil import copyfile

try:
    import htmldata
except ImportError:
    print('Requires Python3 htmldata module:')
    print('  https://github.com/audacity/audacity/blob/master/scripts/mw2html_audacity/htmldata.py')
    sys.exit(1)

config = None

MOVE_HREF = 'movehref'
MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
INDEX_HTML = 'index.html'
QHELP_HTML = 'quick_help.html'

url_filename_cache = {}
redir_cache = {}
wrote_file_set = set()
sidebar_html = ''
footer_text = ''
counter = 0
errors = 0
conn = None
headers = {"User-Agent": "mw2html.py/Audacity"}
domain = ''

MONOBOOK_SKIN = 'monobook'    # Constant identifier for the Monobook skin.

class Config:
    """
    Instances contain all options passed at the command line.
    """
    def __init__(self, rooturl, outdir,
                 flatten=True, index=None, clean=True,
                 sidebar=None, hack_skin=True,
                 made_by=True, overwrite=False, footer=None,
                 skin=MONOBOOK_SKIN, move_href=True,
                 remove_png=True, remove_history=True, limit_parent=False,
                 special_mode=False, debug=False, no_images=False):
        self.rooturl = rooturl
        self.outdir = os.path.abspath(outdir)
        self.flatten = flatten
        self.index = index
        self.clean = clean
        self.sidebar = sidebar
        self.hack_skin = hack_skin
        self.made_by = made_by
        self.overwrite = overwrite
        self.footer = footer
        self.skin = skin
        self.move_href = move_href
        if self.sidebar is not None:
            self.sidebar = os.path.abspath(self.sidebar)
        if self.footer is not None:
            self.footer = os.path.abspath(self.footer)
        self.remove_png = remove_png
        self.remove_history = remove_history
        self.limit_parent = limit_parent
        self.special_mode = special_mode
        self.debug = debug
        self.no_images = no_images

def get_domain(u):
    """
    Get domain of URL.
    """
    url = normalize_url(u)
    # e.g. ParseResult(scheme='http', netloc='www.cwi.nl:80',
    #                  path='/%7Eguido/Python.html', params='', query='', fragment='')
    L = list(urllib.parse.urlparse(url))
    return L[1]

def normalize_url(url, lower=True):
    # URL normalization - only for local comparison operations;
    # use the original url for online requests.
    url = split_section(url)[0]
    if lower:
        url = url.lower()
    #if url.startswith('http://'):
    #    url = url[len('http://'):]
    if url.startswith('https://'):
        url = url[len('https://'):]
    if url.startswith('www.'):
        url = url[len('www.'):]
    url = url.strip('/')
    url = 'https://' + url
    return url
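
# Illustrative example (not executed): normalize_url() reduces equivalent
# spellings of a page URL to a single cache key, e.g.
#   normalize_url('https://WWW.Example.org/Page/#top')  ->  'https://example.org/page'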

def find_tag_limits(doc, filter_string, end_tag, start_tag, start_point=0):
    # Find tag limits - filter_string must be a unique identifier within doc.
    i1 = doc.find(filter_string, start_point)
    if i1 == -1:
        return (-1, -1)
    aux = doc.rfind(start_tag, start_point, i1 + len(filter_string))
    # We've found the filter_string, but it lacks the start_tag, so we return a
    # different value telling the caller to keep searching from the end of the
    # filter_string we found.
    if aux == -1:
        return (-2, i1 + len(filter_string))
    i1 = aux
    sdiv = i1
    ediv = i1 + len(start_tag)
    # Scan forward, counting nested start/end tags, until the matching end_tag.
    while sdiv < ediv and sdiv != -1:
        sdiv = doc.find(start_tag, sdiv + len(start_tag))
        ediv = doc.find(end_tag, ediv + len(end_tag))
    return (i1, ediv)

def clean_tag(doc, filter_string, end_tag, start_tag):
    # Clean tagged text: strip the enclosing tag pair but keep its contents.
    start_point = 0
    while True:
        (start1, start2) = find_tag_limits(doc, filter_string, end_tag, start_tag, start_point)
        if start1 == -1 or start2 == -1:
            return doc
        if start1 == -2:
            start_point = start2
            continue
        end1 = doc.find('>', start1) + 1
        end2 = start2 + len(end_tag)
        doc = doc[:start1] + doc[end1:start2] + doc[end2:]

def remove_tag(doc, start_string, end_tag, start_tag):
    # Remove tagged text: drop the tag pair and everything between.
    start_point = 0
    while True:
        (i1, i2) = find_tag_limits(doc, start_string, end_tag, start_tag, start_point)
        if i1 == -1 or i2 == -1:
            return doc
        if i1 == -2:
            # start_string was found without its start_tag; resume the search
            # past it instead of slicing with the -2 sentinel.
            start_point = i2
            continue
        doc = doc[:i1] + doc[i2 + len(end_tag):]
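
# Illustrative examples (not executed), assuming well-formed input:
#   remove_tag('<p>a</p><div id="x"><div>b</div></div><p>c</p>',
#              '<div id="x"', '</div>', '<div')      ->  '<p>a</p><p>c</p>'
#   clean_tag('<a href="">x</a>', 'href=""', '</a>', '<a ')  ->  'x'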

def monobook_fix_html(doc, page_url):
    """
    Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.
    """
    global config
    if config.made_by:
        doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
    # Obsolete substitutions.
    # doc = remove_tag(doc, '<div class="portlet" id="p-editors">', '</div>', '<div')

    # James: also remove the page/discussion/source/history div.
    doc = remove_tag(doc, '<li id="ca-', '</li>', '<li')
    doc = remove_tag(doc, '<div id="p-search" class="portlet"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="portlet" id="p-personal"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="editornote2"', '</div>', '<div')
    doc = remove_tag(doc, '<div id="p-cactions"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-For_Editors"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="generated-sidebar portlet" id="p-ToDo"', '</div>', '<div')
    doc = remove_tag(doc, '<div class="portlet" id="p-tb"', '</div>', '<div')
    doc = remove_tag(doc, '<div id="catlinks"', '</div>', '<div')

    # Remove javascript.
    doc = remove_tag(doc, '<script', '</script>', '<script')

    # Andre: special mode.
    if config.special_mode:
        # Remove ul list.
        doc = remove_tag(doc, '<ul id="f-list">', '</ul>', '<ul')
        # Remove link rel alternate and edit.
        doc = re.sub(r'<link rel="alternate"[\s\S]+?/>', r'', doc)
        doc = re.sub(r'<link rel="edit"[\s\S]+?/>', r'', doc)
        # Remove print footer.
        doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>', r'', doc)
        # Remove noexport.
        doc = remove_tag(doc, '<div class="noexport"', '</div>', '<div')
        doc = remove_tag(doc, '<span class="noexport"', '</span>', '<span')
        # Remove editornote.
        doc = remove_tag(doc, '<div class="editornote"', '</div>', '<div')
    else:
        # Remove "powered by MediaWiki" logo.
        doc = re.sub(
            r'<div id="f-poweredbyico">[\s\S]+?(<ul id="f-list">)',
            r'\1', doc)

    # Remove "page has been accessed X times" list item.
    doc = re.sub(r'<li id="f-viewcount">[\s\S]+?</li>', r'', doc)
    # Remove disclaimers list item.
    doc = re.sub(r'<li id="f-disclaimer">[\s\S]+?</li>', r'', doc)
    # Remove edit links.
    doc = remove_tag(doc, '<div class="editsection"', '</div>', '<div')
    doc = remove_tag(doc, '<span class="editsection"', '</span>', '<span')
    doc = re.sub(r'<h2>Navigation menu</h2>', r'', doc)
    doc = re.sub(r'Audacity Development Manual</title>', r'Audacity Manual</title>', doc)
    doc = re.sub(r' .lpha Manual</strong>', r' Manual</strong>', doc)
    return doc

def pre_html_transform(doc, url):
    """
    User-customizable HTML transform.

    Given an HTML document (with URLs already rewritten), returns
    modified HTML document.
    """
    global config
    if config.hack_skin:
        if config.skin == MONOBOOK_SKIN:
            doc = monobook_fix_html(doc, url)
            if not config.special_mode:
                doc = monobook_hack_skin_html(doc)
        else:
            raise ValueError('unknown skin')
    if config.move_href:
        doc = fix_move_href_tags(doc)
    if config.remove_history:
        doc = html_remove_image_history(doc)
    doc = html_remove_translation_links(doc)
    return doc

def pos_html_transform(doc, url, filename):
    global footer_text, config, sidebar_html

    url = normalize_url(url, False)

    # Add sidebar.html.
    if config.sidebar is not None and sidebar_html == '':
        f = open(config.sidebar, 'r')
        sidebar_html = f.read()
        f.close()
    # doc = re.sub(r'(<!-- end of the left \(by default at least\) column -->)', sidebar_html + r'\1', doc)

    # Remove empty links.
    doc = clean_tag(doc, 'href=""', '</a>', '<a ')

    if config.special_mode:
        # Remove external link rel stylesheet.
        doc = re.sub(r'<link rel="stylesheet" href="https://[\s\S]+?/>', r'', doc)
        # Remove external javascript.
        doc = re.sub(r'<script type="text/javascript" src="https://[\s\S]+?</script>', r'', doc)

    # Add back the relevant stylesheet, with a relative path that matches the
    # depth of this file under outdir.
    if os.path.dirname(os.path.dirname(filename)) == config.outdir:
        doc = re.sub(r'</head>',
                     '<link rel="stylesheet" href="m/skins/monobook/main.css/303.css" media="screen" />\n</head>',
                     doc, flags=re.DOTALL)
    elif os.path.dirname(os.path.dirname(os.path.dirname(filename))) == config.outdir:
        doc = re.sub(r'</head>',
                     '<link rel="stylesheet" href="../m/skins/monobook/main.css/303.css" media="screen" />\n</head>',
                     doc, flags=re.DOTALL)
    else:
        doc = re.sub(r'</head>',
                     '<link rel="stylesheet" href="../../m/skins/monobook/main.css/303.css" media="screen" />\n</head>',
                     doc, flags=re.DOTALL)

    # Replace remaining text with footer, if available (this needs to be done
    # after parse_html to avoid rewriting of urls).
    if config.footer is not None:
        s1 = '<div id="footer"'
        # Match the correct divs.
        (i1, i2) = find_tag_limits(doc, s1, '</div>', '<div')
        if i1 == -1:
            return doc
        if footer_text == '':
            f = open(config.footer, 'r')
            footer_text = f.read()
            f.close()
        # Add static dump time.
        footer_html = footer_text.replace('%DATE%', strftime("%Y-%m-%d"))
        # Add online url.
        footer_html = footer_html.replace('%ONLINEURL%', url)
        if config.special_mode:
            # Keep MediaWiki credits.
            doc = doc[:i2] + footer_html + doc[i2:]
        else:
            doc = doc[:i1 + len(s1)] + footer_html + doc[i2:]
    return doc

def fix_move_href_tags(doc):
    """
    Return copy of doc with all MOVE_HREF tags removed.
    """
    while '<' + MOVE_HREF in doc:
        i1 = doc.index('<' + MOVE_HREF)
        i2 = doc.index('</' + MOVE_HREF, i1 + 1)
        i3 = doc.index('>', i2 + 1)
        (start, end) = (i1, i3 + 1)
        tags = htmldata.tagextract(doc[start:end])
        assert tags[0][0] == MOVE_HREF
        assert tags[-1][0] == '/' + MOVE_HREF
        href = tags[0][1].get('href', '')
        new_tags = []
        for tag in tags[1:-1]:
            if len(tag) == 2:
                if 'href' in tag[1]:
                    if href == '':
                        continue
                    tag[1]['href'] = href
            new_tags += [tag]
        doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
    return doc
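
# Illustrative example (not executed): with move_href enabled,
#   '<movehref href="a.html"><a href="b.html">x</a></movehref>'
# becomes roughly '<a href="a.html">x</a>' - every href inside the tag pair is
# redirected to the movehref target, and the movehref tags themselves vanish.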

def html_remove_image_history(doc):
    """
    Remove image history and links to information.
    """
    doc = re.sub(r'<h2>Image history</h2>[\s\S]+?</ul>', r'', doc)
    doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc)
    return doc

def html_remove_translation_links(doc):
    """
    Remove translation links (the international flags).

    We identify them by the pattern for a 2 or 3 letter language code,
    /[a-z]{2,3}[/"], in the URL.  The second pattern deals with links like
    /pt_PT and /zh_CN.  We are case sensitive, so as not to treat FAQ as a
    language code.
    """
    doc = re.sub(r'<a href="[^"]+/[a-z]{2,3}[/"][\s\S]+?</a>', r'', doc)
    doc = re.sub(r'<a href="[^"]+/[a-z]{2}_[A-Z]{2}[/"][\s\S]+?</a>', r'', doc)
    return doc
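
# Illustrative examples (not executed; example.org stands in for the wiki host):
#   '<a href="https://example.org/man/fr">Français</a>'  - removed (matches /fr")
#   '<a href="https://example.org/man/FAQ">FAQ</a>'      - kept (uppercase, not a language code)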

def monobook_hack_skin_html(doc):
    """
    Hacks Monobook HTML output: use CSS ids for hacked skin.
    See monobook_hack_skin_css.
    """
    doc = doc.replace('<div id="globalWrapper">', '<div id="globalWrapperHacked">')
    doc = doc.replace('<div id="footer">', '<div id="footerHacked">')
    doc = doc.replace('</body>', '<br></body>')
    return doc

def monobook_hack_skin_css(doc, url):
    """
    Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks.

    Removes flower background.  Defines *Hacked CSS ids, so we can add
    an orange bar at the top, and clear the orange bar right above the
    footer.
    """
    global config
    if not url.endswith('monobook/main.css'):
        return doc
    doc = "/* Monobook skin automatically modified by mw2html. */" + doc
    doc = doc.replace('url("headbg.jpg")', '')
    doc += """
/* Begin hacks by mw2html */

#globalWrapperHacked {
    font-size: 127%;
    width: 100%;
    background-color: White;
    border-top: 1px solid #fabd23;
    border-bottom: 1px solid #fabd23;
    margin: 0.6em 0em 1em 0em;
    padding: 0em 0em 1.2em 0em;
}

#footerHacked {
    background-color: White;
    margin: 0.6em 0em 0em 0em;
    padding: 0.4em 0em 0em 0em;
    text-align: center;
    font-size: 90%;
}

#footerHacked li {
    display: inline;
    margin: 0 1.3em;
}
"""
    c1 = '#column-one { padding-top: 160px; }'
    c2 = '#column-one { padding-top: 3.0em; }'
    assert c1 in doc
    doc = doc.replace(c1, '/* edit by mw2html */\n' + c2 +
                      '\n/* end edit by mw2html */\n')
    doc = doc.replace('h3 { font-size: 90%; }', 'h3 { font-size: 130%; }')
    # Remove external link icons.
    if config.remove_png:
        doc = re.sub(r'#bodyContent a\[href \^="https://"\][\s\S]+?\}', r'', doc)
    return doc

def post_css_transform(doc, url):
    """
    User-customizable CSS transform.

    Given a CSS document (with URLs already rewritten), returns
    modified CSS document.
    """
    global config
    if config.hack_skin and not config.special_mode:
        if config.skin == MONOBOOK_SKIN:
            doc = monobook_hack_skin_css(doc, url)
        else:
            raise ValueError('unknown skin')
    return doc

def move_to_index_if_needed(ans):
    global config
    if ans.endswith(config.index):
        ans = ans[:len(ans) - len(config.index)] + INDEX_HTML
    return ans
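
# Illustrative example (not executed): with -i main_page.html,
#   move_to_index_if_needed('out/wiki/main_page.html')  ->  'out/wiki/index.html'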

def file_exists_in_written_set(filename):
    return os.path.normcase(os.path.normpath(filename)) in wrote_file_set

def find_unused_filename(filename, exists=os.path.exists):
    """
    Return 'file' if 'file' doesn't exist, otherwise 'file1', 'file2', etc.

    Existence is determined by the callable exists(), which takes
    a filename and returns a boolean.
    """
    if not exists(filename):
        return filename
    (head, tail) = os.path.split(filename)
    i = 1
    while True:
        numbered = (os.path.splitext(tail)[0] + str(i) +
                    os.path.splitext(tail)[1])
        fullname = os.path.join(head, numbered)
        if not exists(fullname):
            return fullname
        i += 1
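
# Illustrative example (not executed): if 'out/page.html' is already taken,
#   find_unused_filename('out/page.html', exists)  ->  'out/page1.html'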

def clean_filename(url, ans):
    # Split outdir and our file/dir under outdir.
    # (Note: ans may not be a valid filename.)
    global config
    (par, ans) = (ans[:len(config.outdir)], ans[len(config.outdir):])
    if ans.startswith(os.sep):
        ans = ans[1:]

    # Replace % escape codes with underscores, dashes with underscores.
    while '%%' in ans:
        ans = ans[:ans.index('%%')] + '_' + ans[ans.index('%%') + 2:]
    while '%25' in ans:
        ans = ans[:ans.index('%25')] + '_' + ans[ans.index('%25') + 5:]
    while '%' in ans:
        ans = ans[:ans.index('%')] + '_' + ans[ans.index('%') + 3:]
    ans = ans.replace('-', '_')
    while '__' in ans:
        ans = ans.replace('__', '_')
    while '_.' in ans:
        ans = ans.replace('_.', '.')

    # Rename math thumbnails.
    if '/math/' in url:
        tail = os.path.split(ans)[1]
        if os.path.splitext(tail)[1] == '.png':
            tail = os.path.splitext(tail)[0]
            if set(tail) <= set('0123456789abcdef') and len(tail) == 32:
                # hashlib.md5 requires bytes in Python 3.
                ans = 'math_' + hashlib.md5(tail.encode('ascii')).hexdigest()[:4] + '.png'
    return os.path.join(par, ans)
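
# Illustrative example (not executed): under outdir 'out',
#   clean_filename(url, 'out/My%20Audio-Page.html')  ->  'out/My_Audio_Page.html'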

def flatten_filename(url, filename):
    global config
    def get_fullname(relname):
        return os.path.join(config.outdir, relname)
    orig_ext = os.path.splitext(filename)[1]
    (head, tail) = os.path.split(filename)
    if tail == INDEX_HTML:
        (head, tail) = os.path.split(head)
    ans = tail
    if os.path.splitext(ans)[1] != orig_ext:
        ans = os.path.splitext(ans)[0] + orig_ext
    return os.path.join(config.outdir, ans)
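
# Illustrative examples (not executed), with outdir 'out':
#   flatten_filename(url, 'out/a/b/page.html')   ->  'out/page.html'
#   flatten_filename(url, 'out/a/b/index.html')  ->  'out/b.html'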

def split_section(url):
    """
    Splits into (head, tail), where head contains no '#' and is max length.
    """
    if '#' in url:
        i = url.index('#')
        return (url[:i], url[i:])
    else:
        return (url, '')
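
# Illustrative examples (not executed):
#   split_section('Main_Page#intro')  ->  ('Main_Page', '#intro')
#   split_section('Main_Page')        ->  ('Main_Page', '')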

def url_open(url):
    # Download a file and retrieve its content and mimetype.
    global conn, domain, counter, redir_cache, errors, headers

    l_redir = []
    redirect = url
    while redirect != '':
        l_redir += [url]
        L = urllib.parse.urlparse(url)
        if L[1] != domain:
            conn.close()
            if L[1] == '':
                return ('', '')
            print("connection to", domain, "closed.")
            conn = http.client.HTTPSConnection(L[1])
            domain = L[1]
            print("connection to", domain, "opened.")
        rel_url = url
        pos = url.find(domain)
        if pos != -1:
            rel_url = url[pos + len(domain):]

        attempts = 0
        # Number of attempts.
        total_attempts = 3
        recovered = False
        success = False
        while not success and attempts < total_attempts:
            # Increment httplib requests counter.
            counter += 1
            try:
                conn.request("GET", rel_url, headers=headers)
                r = conn.getresponse()
                print('Status', r.status, r.reason, 'accessing', rel_url)
                if r.status == 404:
                    print("  it's not possible to recover this error.")
                    errors += 1
                    return ('', '')
                if r.status == 500:
                    print("  eventually this error might be recovered. let's try again.")
                    print('  reconnecting...')
                    conn = http.client.HTTPSConnection(domain)
                    attempts += 1
                    continue
                if r.status == 403:
                    print("  that shouldn't happen, but let's try again anyway.")
                    print('  reconnecting...')
                    conn = http.client.HTTPSConnection(domain)
                    attempts += 1
                    continue
                if attempts != 0:
                    recovered = True
                if r.status != 200:
                    print("  Status other than 200, 404, 500, 403. It is:", r.status)
                success = True
            except http.client.HTTPException as e:
                print('ERROR', e.__class__.__name__, 'while retrieving', url)
                conn.close()
                if e.__class__.__name__ in ['BadStatusLine', 'ImproperConnectionState', 'NotConnected', 'IncompleteRead', 'ResponseNotReady']:
                    print("eventually this error might be recovered. let's try again.")
                    print('reconnecting...')
                    conn = http.client.HTTPSConnection(domain)
                    attempts += 1
                else:
                    print("it's not possible to recover this error.")
                    errors += 1
                    return ('', '')
        if recovered:
            print("error recovered")
        if not success:
            print("it was not possible to recover this error.")
            errors += 1
            return ('', '')
        redirect = r.getheader('Location', '').split(';')[0]
        if redirect != "":
            url = redirect
        else:
            doc = r.read()

    for item in l_redir:
        redir_cache[normalize_url(item)] = normalize_url(url)
    mimetype = r.getheader('Content-Type', '').split(';')[0].lower()
    return (doc, mimetype)

def url_to_filename(url):
    """
    Translate a full url to a full filename (in local OS format) under outdir.

    Transforms the web url into a local url and caches it.  Downloads the
    file to disk and works with it there, instead of downloading the same
    file twice (performance improvement).
    """
    global config
    nurl = normalize_url(url)
    if nurl in url_filename_cache:
        return url_filename_cache[nurl]
    # e.g. ParseResult(scheme='http', netloc='www.cwi.nl:80',
    #                  path='/%7Eguido/Python.html', params='', query='', fragment='')
    turl = re.sub(r'm/index.php\?title=', r'man/', nurl)
    turl = re.sub(r'.css&[\S\s]+', r'.css', turl)
    L = list(urllib.parse.urlparse(turl))

    # This way the url will not create a folder outside of the main domain.
    droot = get_domain(config.rooturl)
    if L[1] != droot:
        L[1] = droot

    L[2] = L[2].strip('/')
    lpath = L[2].split('/')
    if '.' not in lpath[-1]:
        # url ends with a directory name. Store it under index.html.
        # L[2] += '/' + INDEX_HTML
        pass
    else:
        # 'title=' parsing.
        if L[4].startswith('title=') and L[2].endswith('index.php'):
            L[4] = L[4][len('title='):]
            L[2] = L[2][:-len('index.php')]
    if lpath[-1] == 'man':
        L[2] = INDEX_HTML
    if lpath[-1].lower().startswith('quick_help'):
        L[2] = QHELP_HTML
    L[3] = ''
    L[2] = L[2].strip('/')

    # Don't sanitize '/' for the path.
    L[0] = ''
    L[2] = urllib.parse.quote_plus(L[2], '/')
    L[3] = urllib.parse.quote_plus(L[3])
    L[4] = urllib.parse.quote_plus(L[4])
    L[5] = urllib.parse.quote_plus(L[5])

    # Local filename relative to outdir.
    # os.sep - O.S. directory separator.
    # (More transformations are made to this below...)
    FL = []
    for i in L:
        if i != '':
            FL += [i]
    subfile = os.sep.join(FL)

    (doc, mimetype) = url_open(url)
    if doc == '' or mimetype == '':
        url_filename_cache[nurl] = ''
        return ''

    # Fix up extension based on mime type.
    # Maps mimetype to file extension.
    MIME_MAP = {
        'image/jpeg': 'jpg', 'image/png': 'png', 'image/gif': 'gif',
        'image/tiff': 'tiff', 'text/plain': 'txt', 'text/html': 'html',
        'text/rtf': 'rtf', 'text/css': 'css', 'text/sgml': 'sgml',
        'text/xml': 'xml', 'application/zip': 'zip'
    }
    if mimetype in MIME_MAP:
        (root, ext) = os.path.splitext(subfile)
        ext = '.' + MIME_MAP[mimetype]
        subfile = root + ext

    subfile = subfile.lower()
    ans = os.path.join(config.outdir, subfile)
    if config.flatten:
        ans = flatten_filename(nurl, ans)
    if config.clean:
        ans = clean_filename(nurl, ans)
    if config.index is not None:
        ans = move_to_index_if_needed(ans)
    ans = find_unused_filename(ans, file_exists_in_written_set)

    # Cache and return answer.
    wrote_file_set.add(os.path.normcase(os.path.normpath(ans)))
    url_filename_cache[nurl] = ans

    # Make parent directory if it doesn't exist.
    try:
        os.makedirs(os.path.split(ans)[0])
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Not really needed since we checked that the directory
    # outdir didn't exist at the top of run(), but let's double check.
    if os.path.exists(ans) and not config.overwrite:
        sys.stdout.write('File already exists: ' + str(ans) + '\n')
        sys.exit(1)

    if mimetype.startswith('text'):
        # doc is bytes; str() keeps the b'...' escape sequences, which are
        # cleaned up later in parse_html.
        f = open(ans, 'w', encoding='utf8')
        doc = str(doc)
    else:
        f = open(ans, 'wb')
    f.write(doc)
    f.close()
    return ans

def url_to_relative(url, cururl):
    """
    Translate a full url to a filename (in URL format) relative to cururl.

    Relative url from cururl to url.
    """
    cururl = split_section(cururl)[0]
    (url, section) = split_section(url)

    filename = url_to_filename(url)
    if filename == '':
        return ''
    L1 = filename.replace(os.sep, '/').strip('/').split('/')
    L2 = url_to_filename(cururl).replace(os.sep, '/').strip('/').split('/')
    while L1 != [] and L2 != [] and L1[0] == L2[0]:
        L1 = L1[1:]
        L2 = L2[1:]
    rel_url = urllib.parse.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
    if rel_url == '':
        return '#'
    else:
        return rel_url
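
# Illustrative example (not executed): if url maps to 'out/man/edit.html' and
# cururl maps to 'out/man/play.html', the common 'out/man' prefix is stripped
# and the result is 'edit.html'; a target one directory up would gain a '../'.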

def parse_css(doc, url):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links found in the CSS.
    """
    global config
    new_urls = []
    L = htmldata.urlextract(doc, url, 'text/css')
    for item in L:
        # Store url locally.
        u = item.url
        if config.no_images and any(u.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
            item.url = ''
            continue
        new_urls += [u]
        item.url = url_to_relative(u, url)
    newdoc = htmldata.urljoin(doc, L)
    newdoc = post_css_transform(newdoc, url)
    return (newdoc, new_urls)

def should_follow(url):
    """
    Returns a boolean for whether url should be spidered.

    Given that 'url' was linked to from the site, return whether
    'url' should be spidered as well.
    """
    global config

    # We don't have search on the local version.
    if url.endswith('#searchInput'):
        return False

    # False if different domains.
    nurl = normalize_url(url)
    droot = get_domain(config.rooturl)
    dn = get_domain(nurl)
    #if droot != dn and not (dn.endswith(droot) or droot.endswith(dn)):
    if droot != dn:
        if config.debug:
            print(url, 'not in the same domain')
        return False

    # False if multiple query fields or parameters found.
    if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
        if config.debug:
            print(url, 'with multiple query fields')
        return False

    if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:', 'User_talk:', 'MediaWiki_talk:', 'File:', 'action=edit', 'title=-')):
        if config.debug:
            print(url, 'is a forbidden wiki page')
        return False

    if config.no_images and any(url.strip().lower().endswith(suffix) for suffix in ('.jpg', '.gif', '.png', '.ico')):
        if config.debug:
            print(url, 'is an image and you are in no-images mode')
        return False

    if any(url.strip().lower().endswith(suffix) for suffix in ('.zip', '.7z')):
        if config.debug:
            print(url, 'is a compressed file')
        return False

    # limit_parent support.
    ncurl = normalize_url(config.rooturl)
    if config.limit_parent and not nurl.startswith(ncurl):
        L = nurl.split('/')
        if '.' not in L[-1]:
            if config.debug:
                print(url, 'is a file outside of scope with unknown extension')
            return False
        # JKC: we do allow css from 'strange' places.
        if '.css' in L[-1]:
            return True
        forbidden_parents = ['.php', '.html', '.htm']
        for fp in forbidden_parents:
            if fp in L[-1]:
                if config.debug:
                    print(url, 'is a page outside of scope')
                return False
    return True
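
# Illustrative examples (not executed), spidering https://example.org/man/:
#   'https://example.org/man/Tracks'          - followed
#   'https://example.org/man/Special:Search'  - skipped (forbidden wiki page)
#   'https://other.org/page'                  - skipped (different domain)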

def parse_html(doc, url, filename):
    """
    Returns (modified_doc, new_urls), where new_urls are absolute URLs for
    all links we want to spider in the HTML.
    """
    global config
    global counter

    BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
    END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'

    new_urls = []
    doc = pre_html_transform(doc, url)

    # Temporarily "get rid" of comments so htmldata will find the URLs
    # in the funky "<!--[if" HTML hackery for IE.
    doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
    doc = doc.replace('-->', END_COMMENT_REPLACE)

    L = htmldata.urlextract(doc, url, 'text/html')

    # In this code we change each absolute url in L into a relative one.
    # We also kick off zillions of subthreads to collect more pages.
    for item in L:
        u = item.url
        follow = should_follow(u)  # and (counter < 10)
        if follow:
            if config.debug:
                print('ACCEPTED - ', u)
            # Store url locally.
            new_urls += [u]
            item.url = url_to_relative(u, url)
        else:
            # James, let's keep everything by default (but not follow it).
            # if not any(license in u for license in ('creativecommons.org', 'wxwidgets.org', 'gnu.org', 'mediawiki.org')):
            #     item.url = ''
            if config.debug:
                print('NOT INCLUDED - ', u)

    newdoc = htmldata.urljoin(doc, L)
    newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
    newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
    newdoc = pos_html_transform(newdoc, url, filename)

    # Remove the comments.
    p = re.compile('<!--.*?-->', re.DOTALL)
    newdoc = p.sub('', newdoc)

    # Remove byte artifacts in the string (doc was read as bytes and
    # converted with str() in url_to_filename).
    newdoc = newdoc.replace('\\n', '\n')
    newdoc = newdoc.replace('\\t', '\t')
    newdoc = newdoc.replace('\\\'', '\'')
    newdoc = newdoc.replace('\\\\', '\\')
    newdoc = newdoc.replace('\\xe2\\x80\\x99', '\'')
    newdoc = newdoc.replace('\\xe2\\x80\\x90', '-')
    newdoc = newdoc.strip('b')
    newdoc = newdoc.strip('\'')
    return (newdoc, new_urls)

def deploy_file(src, dest):
    src_dir = os.path.dirname(os.path.realpath(__file__))
    src = os.path.join(src_dir, src)
    dest = os.path.join(config.outdir, dest)
    print("copying from", src, "to", dest)
    directory = os.path.dirname(dest)
    if not os.path.exists(directory):
        os.makedirs(directory)
    copyfile(src, dest)

def run(out=sys.stdout):
    """
    Code interface.
    """
    global conn, domain, counter, redir_cache, config, headers

    if urllib.parse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
        out.write('Please do not use robots with the Wikipedia site.\n')
        out.write('Instead, install the Wikipedia database locally and use mw2html on\n')
        out.write('your local installation. See the Mediawiki site for more information.\n')
        sys.exit(1)

    # Number of files saved.
    n = 0

    if not config.overwrite and os.path.exists(config.outdir):
        out.write('Error: Directory exists: ' + str(config.outdir))
        sys.exit(1)

    domain = get_domain(config.rooturl)
    conn = http.client.HTTPSConnection(domain)
    print('connection established to:', domain)

    complete = set()
    pending = set([config.rooturl])
    start = True
    while len(pending) > 0:
        url = pending.pop()
        nurl = normalize_url(url)
        if nurl in redir_cache:
            nurl = redir_cache[nurl]
        if nurl in complete:
            if config.debug:
                print(url, 'already processed')
            continue
        complete.add(nurl)
        filename = url_to_filename(url)

        # This is needed for the first pass, as we don't yet know whether the
        # start url is a redirect; at this point all the content of
        # redir_cache is relative to the start path.
        if start:
            start = False
            aux_url = ''
            for redir in redir_cache.keys():
                aux_url = normalize_url(redir)
                url_filename_cache[aux_url] = filename
                if aux_url not in complete:
                    complete.add(aux_url)
            if aux_url != '':
                nurl = normalize_url(redir_cache[nurl])

        if filename == '':
            continue
        if not os.path.exists(filename):
            print("ERROR: ", url, '\n')
            continue

        # These formats are encoded as text.  Everything else is read as bytes.
        text_ext = ('txt', 'html', 'rtf', 'css', 'sgml', 'xml')
        if not filename.endswith(text_ext):
            f = open(filename, 'rb')
        else:
            f = open(filename, 'r')
        doc = f.read()
        f.close()

        new_urls = []
        if filename.endswith('.html'):
            (doc, new_urls) = parse_html(doc, url, filename)
        elif filename.endswith('.css'):
            (doc, new_urls) = parse_css(doc, url)

        # Save document changes to disk.
        # The unmodified file already exists on disk.
        update = False
        for ext in text_ext:
            if filename.endswith(ext):
                update = True
                break
        if update:
            f = open(filename, 'w')
            f.write(doc)
            f.close()
        if config.debug:
            out.write(url + '\n => ' + filename + '\n\n')
        n += 1

        # Enqueue URLs that we haven't yet spidered.
        for u in new_urls:
            if normalize_url(u) not in complete:
                # Strip off any #section link.
                if '#' in u:
                    u = u[:u.index('#')]
                pending.add(u)

    conn.close()
    print("connection to", domain, "closed.")
    out.write(str(n) + ' files saved\n')
    print(counter, "httplib requests done")
    print(errors, "errors not recovered")

    # Use / not \ so as to work on both Windows and Mac.
    deploy_file("AudacityLogo.png", r"alphamanual.audacityteam.org/m/resources/assets/AudacityLogo.png")
    deploy_file("303.css", r"alphamanual.audacityteam.org/m/skins/monobook/main.css/303.css")
    deploy_file("headbg.jpg", r"alphamanual.audacityteam.org/m/skins/monobook/headbg.jpg")
    deploy_file("audio.png", r"alphamanual.audacityteam.org/m/skins/monobook/audio.png")
    deploy_file("bullet.gif", r"alphamanual.audacityteam.org/m/skins/monobook/bullet.gif")
    deploy_file("external.png", r"alphamanual.audacityteam.org/m/skins/monobook/external.png")
    deploy_file("external_rtl.png", r"alphamanual.audacityteam.org/m/skins/monobook/external_rtl.png")
    deploy_file("user.gif", r"alphamanual.audacityteam.org/m/skins/monobook/user.gif")
    deploy_file("video.png", r"alphamanual.audacityteam.org/m/skins/monobook/video.png")

def usage():
    """
    Print command line options.
    """
    usage_str = """
mw2html url outdir [options]

MW2HTML Audacity version
Converts an entire Mediawiki site into static HTML.

WARNING: This is a recursive robot that ignores robots.txt.  Use with care.

  url     - URL of mediawiki page to convert to static HTML.
  outdir  - Output directory.

  -f, --force           - Overwrite existing files in outdir.
  -d, --debug           - Debug mode.
  -s, --special-mode    - Implies -f --no-flatten --limit-parent -l sidebar.html
                          -b footer.html; keeps the MediaWiki icon and makes
                          more design changes.
  --no-flatten          - Do not flatten directory structure.
  --no-clean            - Do not clean up filenames (clean replaces
                          non-alphanumeric chars with _, renames math thumbs).
  --no-hack-skin        - Do not modify skin CSS and HTML for looks.
  --no-made-by          - Suppress "generated by" comment in HTML source.
  --no-move-href        - Disable <movehref> tag. [1]
  --no-remove-png       - Retain external link PNG icons.
  --no-remove-history   - Retain image history and links to information.
  --no-images           - Discard images.
  --limit-parent        - Do not explore .php pages outside the url path
                          (css, images and other non-page files are not affected).
  -l, --left=a.html     - Paste HTML fragment file into left sidebar.
  -t, --top=a.html      - Paste HTML fragment file into top horiz bar.
  -b, --bottom=a.html   - Paste HTML fragment file into footer horiz bar.
  -i, --index=filename  - Move given filename in outdir to index.html.

Example Usage:
  mw2html http://127.0.0.1/mywiki/ out -f -i main_page.html -l sidebar.html
  Freezes wiki into 'out' directory, moves main_page.html => index.html,
  assumes sidebar.html is defined in the current directory.

[1]. The <movehref> tag.
  Wiki syntax: <html><movehref href="a"></html>...<html></movehref></html>.
  When enabled, this tag will cause all href= attributes inside of it to be
  set to the given location.  This is useful for linking images.
  In MediaWiki, for the <html> tag to work, one needs to enable $wgRawHtml
  and $wgWhitelistEdit in LocalSettings.php.  A <movehref> tag with no href
  field will remove all links inside it.
"""
    print(textwrap.dedent(usage_str.strip('\n')))
    sys.exit(1)

def main():
    """
    Command line interface.
    """
    global config
    try:
        (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fsdl:t:b:i:',
                                         ['force', 'no-flatten', 'no-clean',
                                          'no-hack-skin', 'no-made-by', 'left=',
                                          'top=', 'bottom=', 'index=', 'no-move-href',
                                          'no-remove-png', 'no-remove-history', 'limit-parent',
                                          'special-mode', 'debug', 'no-images'])
    except getopt.GetoptError:
        usage()

    # Parse non-option arguments.
    try:
        (rooturl, outdir) = args
    except ValueError:
        usage()

    config = Config(rooturl=rooturl, outdir=outdir)

    # Parse option arguments.  getopt reports the short form of
    # --special-mode as '-s', so every rule implied by special mode
    # also checks for '-s'.
    for (opt, arg) in opts:
        if opt in ['-f', '--force', '-s', '--special-mode']:
            config.overwrite = True
        if opt in ['--no-flatten', '-s', '--special-mode']:
            config.flatten = False
        if opt in ['--no-clean']:
            config.clean = False
        if opt in ['--no-hack-skin']:
            config.hack_skin = False
        if opt in ['--no-made-by']:
            config.made_by = False
        if opt in ['--no-move-href']:
            config.move_href = False
        if opt in ['--no-remove-png']:
            config.remove_png = False
        if opt in ['--no-remove-history']:
            config.remove_history = False
        if opt in ['--no-images']:
            config.no_images = True
        if opt in ['--limit-parent', '-s', '--special-mode']:
            config.limit_parent = True
        if opt in ['-s', '--special-mode']:
            config.special_mode = True
            config.sidebar = 'sidebar.html'
            config.footer = 'footer.html'
        if opt in ['-d', '--debug']:
            config.debug = True
        if opt in ['-l', '--left']:
            config.sidebar = os.path.abspath(arg)
        if opt in ['-t', '--top']:
            raise NotImplementedError('-t/--top (header fragment) is not implemented')
        if opt in ['-b', '--bottom']:
            config.footer = os.path.abspath(arg)
        if opt in ['-i', '--index']:
            config.index = arg

    # Run program.
    run()

if __name__ == '__main__':
    main()