形成单文件,方便复制到移动端查看
#!/usr/bin/env python2.7
from bs4 import BeautifulSoup as bs
import base64 as b64
import urllib
import os
import re
import unicodedata
import inspect
# Templates used when inlining external resources into the HTML document.
# ``{0}`` (and ``{1}``) are filled in with str.format().

# Wrapper for inlined stylesheet text.
cssContainer = '<style type="text/css"><!--\n{0}\n-->\n</style>'
# Wrapper for inlined script text.  The element must be closed with
# </script>; closing it with </style> leaves the script element unterminated.
jsContainer = '<script type="text/javascript"><!--\n{0}\n-->\n</script>'
# data: URI for an <img> src; {0} is the image subtype, {1} the base64 payload.
imgSrcContainer = r'data:image/{0};base64,{1}'
# data: URI for a PNG referenced from CSS; {0} is the base64 payload.
cssUriContainer = r'data:image/png;base64,{0}'
# Matches url(...) references to PNG files in CSS.  Group 1 is the optional
# quote character, group 2 the path.  The path may not contain ')' so that a
# match can never run across several url(...) tokens on the same line.
cssImgPattern = re.compile(r'''url\((["']?)([^)]+?\.png)\1\)''', flags=re.I)
def printpath(path):
    """Print *path*, showing only its tail when it is too wide.

    Widths come from ``terminalWidth``.  When the total width exceeds 76
    columns the path is cut from the left and prefixed with ``...``.
    """
    text, widths = terminalWidth(path)
    if sum(widths) > 76:
        # Accumulate widths from the last character backwards; stop at the
        # first position reached after more than 73 columns were collected.
        used = 0
        idx = len(widths) - 1
        while idx >= 0:
            if used > 73:
                start = idx + 2
                break
            used += widths[idx]
            idx -= 1
        text = u'...' + text[start:]
    print(text)
def terminalWidth(string, encoding='utf-8'):
    """Return *string* as text plus the display width of each character.

    Args:
        string: Text, or a byte string that is decoded with *encoding*.
        encoding: Codec used to decode byte strings (default ``'utf-8'``).

    Returns:
        A tuple ``(text, widths)`` where ``widths[i]`` is 2 when
        ``unicodedata.east_asian_width`` classifies ``text[i]`` as wide
        (``'W'``) or full-width (``'F'``), and 1 otherwise.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so byte strings are still
    # decoded there, while text strings pass through untouched on Python 3.
    if isinstance(string, bytes):
        string = string.decode(encoding)
    charWidthList = [2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
                     for ch in string]
    return (string, charWidthList)
def get_encoding(soup):
    """Guess the document encoding from the first ``<meta>`` tag of *soup*.

    The attributes of ``soup.meta`` are consulted in this order: ``charset``,
    then ``content-type``, then a ``charset=...`` token inside ``content``.

    Returns:
        The encoding name found, or ``'utf-8'`` when there is no ``<meta>``
        tag or none of the attributes yields a value.
    """
    default = 'utf-8'
    meta = soup.meta
    if meta is None:
        return default
    for attr in ('charset', 'content-type'):
        encod = meta.get(attr)
        if encod is not None:
            return encod
    content = meta.get('content')
    if content is None:
        # A <meta> without ``content`` (e.g. <meta name="x">) carries no
        # charset information; re.search() would raise TypeError on None.
        return default
    # Stop at whitespace, ';' or a quote so trailing parameters are not
    # captured as part of the encoding name.
    match = re.search(r'''charset=([^\s;"']+)''', content)
    return match.group(1) if match else default
def _inlineCssPngs(line, cssrootpath):
    """Return *line* with each readable ``url(...png)`` turned into a data URI."""
    def toDataUri(match):
        pngpath = os.path.join(cssrootpath, match.group(2))
        try:
            with open(pngpath, 'rb') as pngf:
                pngdata = cssUriContainer.format(b64.b64encode(pngf.read()))
        except IOError:
            # Unreadable image: keep the original reference instead of
            # replacing it with an empty url().
            return match.group(0)
        # Swap only the path (group 2); ``url(``, quotes and ``)`` stay as is.
        whole = match.group(0)
        offset = match.start()
        return (whole[:match.start(2) - offset] + pngdata +
                whole[match.end(2) - offset:])
    return cssImgPattern.sub(toDataUri, line)


def embedCSS(soup, rootpath, encoding):
    """Inline local stylesheets referenced by ``<style>``/``<link>`` tags.

    A ``<style type="text/css" src=...>`` tag, or a ``<link>`` tag whose
    ``rel`` contains ``stylesheet`` or whose ``type`` is ``text/css``, has its
    file read relative to *rootpath*.  PNG ``url(...)`` references inside the
    CSS are replaced by base64 data URIs, and the result is appended to
    ``soup.head`` as a new ``<style>`` element.

    Args:
        soup: Parsed document; modified in place.
        rootpath: Directory the ``src``/``href`` values are resolved against.
        encoding: Encoding used to turn attribute values into file names.
    """
    for tag in soup(['style', 'link']):
        if tag.name == 'style':
            attr = 'src'
            wanted = tag.get('type') == 'text/css'
        else:
            attr = 'href'
            wanted = ('stylesheet' in tag.get('rel', '') or
                      tag.get('type', '') == 'text/css')
        if not wanted or tag.get(attr) is None:
            continue
        path = os.path.join(
            rootpath, urllib.unquote(tag[attr].encode(encoding)))
        cssrootpath = os.path.dirname(path)
        try:
            with open(path) as cssf:
                css = ''.join(
                    _inlineCssPngs(line, cssrootpath) for line in cssf)
        except IOError:
            # Not a readable local file (for instance a remote URL): leave
            # the tag and its reference untouched.
            continue
        # Drop the external reference only once the content is in hand.
        del tag[attr]
        printpath(path)
        soup.head.append(bs(cssContainer.format(css)).style)
def embedJS(soup, rootpath, encoding):
    """Inline local scripts referenced by ``<script src=...>`` tags.

    Each script file is read relative to *rootpath*, wrapped with
    ``jsContainer`` and appended to ``soup.head``.  The ``src`` attribute is
    removed from the original tag only after the file was read successfully,
    so scripts that cannot be read keep their reference.

    Args:
        soup: Parsed document; modified in place.
        rootpath: Directory the ``src`` values are resolved against.
        encoding: Encoding used to turn attribute values into file names.
    """
    for tag in soup('script'):
        src = tag.get('src')
        if src is None:
            # Inline script without an external source: nothing to embed.
            continue
        path = os.path.join(rootpath, urllib.unquote(src.encode(encoding)))
        try:
            with open(path, 'rb') as jsf:
                code = jsf.read()
        except IOError:
            # Not a readable local file (for instance a remote URL).
            continue
        del tag['src']
        soup.head.append(bs(jsContainer.format(code)).script)
        printpath(path)
def embedImage(soup, rootpath, encoding):
    """Replace the ``src`` of local ``<img>`` tags in the body by data URIs.

    Images whose file cannot be read, and ``<img>`` tags without a ``src``
    attribute, are left unchanged.

    Args:
        soup: Parsed document; modified in place.
        rootpath: Directory the ``src`` values are resolved against.
        encoding: Encoding used to turn attribute values into file names.
    """
    # Extensions whose MIME subtype is not simply the extension itself.
    subtypes = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml',
                'ico': 'x-icon', 'tif': 'tiff'}
    if soup.body is None:
        return
    for img in soup.body.findAll('img'):
        src = img.get('src')
        if src is None:
            continue
        imgpath = os.path.join(rootpath, urllib.unquote(src.encode(encoding)))
        try:
            with open(imgpath, 'rb') as imgf:
                imgb64data = b64.b64encode(imgf.read())
        except IOError:
            continue
        # Lower-case so that e.g. ``.PNG`` yields ``image/png``.
        ext = os.path.splitext(imgpath)[1].lstrip('.').lower()
        img['src'] = imgSrcContainer.format(subtypes.get(ext, ext), imgb64data)
        printpath(imgpath)
def main(pathOfFile):
    """Bundle the HTML file at *pathOfFile* into a single self-contained file.

    Stylesheets, scripts and images are embedded by ``embedCSS``, ``embedJS``
    and ``embedImage``.  The result is written next to the input as
    ``<name>_single_<ext>``, encoded with the encoding reported by
    ``get_encoding``.
    """
    filename, fileext = os.path.splitext(pathOfFile)
    rootpath = os.path.dirname(pathOfFile)
    with open(pathOfFile, 'rb') as htmlf:
        soup = bs(htmlf.read())
    encoding = get_encoding(soup).lower()
    embedCSS(soup, rootpath, encoding)
    embedJS(soup, rootpath, encoding)
    embedImage(soup, rootpath, encoding)
    # Serialise before opening the output so a failure here does not leave an
    # empty file behind.
    html = soup.prettify(encoding)
    with open(filename + '_single_' + fileext, 'wb') as outf:
        outf.write(html)
# Script entry point: the first command-line argument is the HTML file to bundle.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s HTML_FILE' % sys.argv[0])
    htmlfile = sys.argv[1]
    main(htmlfile)