文章会有【原创】或者【转载】标识，部分内容来自 Google、Baidu 的学习结果 {Java/PHP/Python/Ruby/Go}

Python 抓取vamei大神的文章【原创】  


brew install Caskroom/cask/wkhtmltopdf
pip install pdfkit

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from lxml import etree
import urllib
import pdfkit

def getUrlContents(url):
    """Save every article linked from a page as a PDF under ./Good/.

    Downloads ``url``, selects each ``<a class="postTitle2">`` anchor and
    renders the anchor's target to ``./Good/<title>.pdf`` with pdfkit.
    Conversion is best-effort: a failure on one article is reported on
    stdout and the remaining articles are still processed.

    Args:
        url: Address of the page whose article links should be converted.

    Returns:
        None. The link, the title and a separator line are printed for
        each converted article; failures are printed with their cause.
    """
    import os  # local import: only the output-directory check needs it

    output_dir = './Good/'
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        # 'custom-header': [
        #     ('Accept-Encoding', 'gzip')
        # ]
    }

    # Make sure the target directory exists before any PDF is written
    # into it; otherwise every conversion would fail.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    page = etree.HTML(requests.get(url).content)
    for link in page.xpath('//a[@class="postTitle2"]'):
        hrefs = link.xpath('@href')
        titles = link.xpath('text()')
        # An anchor without an href or without a non-empty title cannot be
        # turned into a named PDF, so skip it explicitly.
        if not hrefs or not titles or titles[0] == "":
            continue
        href = hrefs[0]
        title = titles[0]
        # A '/' in the title would be read as a directory separator.
        pdf_path = output_dir + title.replace('/', '_') + '.pdf'
        try:
            print(href)
            print(title)
            pdfkit.from_url(href, pdf_path, options=options)
            print('#################')
        except Exception as ex:
            # Deliberately broad: one bad article must not abort the batch,
            # but the failure is reported instead of being hidden.
            print("Failed to convert %s: %s" % (href, ex))

if __name__ == '__main__':
    # Script entry point: convert the articles linked from this page.
    start_page = 'http://www.cnblogs.com/vamei/archive/2013/03/31/2991531.html'
    getUrlContents(start_page)


直接读取 URL，生成 PDF 文件。