python实现知乎高颜值图片爬取


Posted in Python onAugust 12, 2019

导入相关包

import time
import pydash
import base64
import requests
from lxml import etree
from aip import AipFace
from pathlib import Path

百度云 人脸检测 申请信息

#唯一必须填的信息就这三行
APP_ID = "xxxxxxxx"
API_KEY = "xxxxxxxxxxxxxxxx"
SECRET_KEY = "xxxxxxxxxxxxxxxx"
# 过滤颜值阈值,存储空间大的请随意
BEAUTY_THRESHOLD = 55
AUTHORIZATION = "oauth c3cef7c66a1843f8b3a9e6a1e3160e20"
# 如果权限错误,浏览器中打开知乎,在开发者工具复制一个,无需登录
# 建议最好换一个,因为不知道知乎的反爬虫策略,如果太多人用同一个,可能会影响程序运行

以下皆无需改动

# 每次请求知乎的讨论列表长度,不建议设定太长,注意节操
LIMIT = 5
# 这是话题『美女』的 ID,其是『颜值』(20013528)的父话题
SOURCE = "19552207"

爬虫假装下正常浏览器请求

USER_AGENT = "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.5 Safari/534.55.3"
REFERER = "https://www.zhihu.com/topic/%s/newest" % SOURCE
# 某话题下讨论列表请求 url
BASE_URL = "https://www.zhihu.com/api/v4/topics/%s/feeds/timeline_activity"
# 初始请求 url 附带的请求参数
URL_QUERY = "?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.comment_count&limit=" + str(
  LIMIT)

HEADERS = {
  "User-Agent": USER_AGENT,
  "Referer": REFERER,
  "authorization": AUTHORIZATION

指定 url,获取对应原始内容 / 图片

def fetch_image(url):
  try:
    response = requests.get(url, headers=HEADERS)
  except Exception as e:
    raise e
  return response.content

指定 url,获取对应 JSON 返回 / 话题列表

def fetch_activities(url):
  try:
    response = requests.get(url, headers=HEADERS)
  except Exception as e:
    raise e
  return response.json()

处理返回的话题列表

def parser_activities(datums, face_detective):
  for data in datums["data"]:
    target = data["target"]
    if "content" not in target or "question" not in target or "author" not in target:
      continue
    html = etree.HTML(target["content"])
    seq = 0
    title = target["question"]["title"]
    author = target["author"]["name"]
    images = html.xpath("//img/@src")
    for image in images:
      if not image.startswith("http"):
        continue
      image_data = fetch_image(image)
      score = face_detective(image_data)
      if not score:
        continue
      name = "{}--{}--{}--{}.jpg".format(score, author, title, seq)
      seq = seq + 1
      path = Path(__file__).parent.joinpath("image").joinpath(name)
      try:
        f = open(path, "wb")
        f.write(image_data)
        f.flush()
        f.close()
        print(path)
        time.sleep(2)
      except Exception as e:
        continue
  if not datums["paging"]["is_end"]:
    return datums["paging"]["next"]
  else:
    return None

初始化颜值检测工具

def init_detective(app_id, api_key, secret_key):
  client = AipFace(app_id, api_key, secret_key)
  options = {"face_field": "age,gender,beauty,qualities"}
  def detective(image):
    image = str(base64.b64encode(image), "utf-8")
    response = client.detect(str(image), "BASE64", options)
    response = response.get("result")
    if not response:
      return
    if (not response) or (response["face_num"] == 0):
      return
    face_list = response["face_list"]
    if pydash.get(face_list, "0.face_probability") < 0.6:
      return
    if pydash.get(face_list, "0.beauty") < BEAUTY_THRESHOLD:
      return
    if pydash.get(face_list, "0.gender.type") != "female":
      return
    score = pydash.get(face_list, "0.beauty")
    return score
  return detective

程序入口

def main():
  face_detective = init_detective(APP_ID, API_KEY, SECRET_KEY)
  url = BASE_URL % SOURCE + URL_QUERY
  while url is not None:
    datums = fetch_activities(url)
    url = parser_activities(datums, face_detective)
    time.sleep(5)
if __name__ == '__main__':
  main()

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持三水点靠木。

Python 相关文章推荐
Python中的并发编程实例
Jul 07 Python
Python去除字符串两端空格的方法
May 21 Python
详解使用Python处理文件目录的相关方法
Oct 16 Python
python中的错误处理
Apr 10 Python
django反向解析URL和URL命名空间的方法
Jun 05 Python
Python变量访问权限控制详解
Jun 29 Python
Python封装成可带参数的EXE安装包实例
Aug 24 Python
python创建学生管理系统
Nov 22 Python
python能在浏览器能运行吗
Jun 17 Python
python爬虫请求头设置代码
Jul 28 Python
Python 读取位于包中的数据文件
Aug 07 Python
Python实现的扫码工具居然这么好用!
Jun 07 Python
python3 enum模块的应用实例详解
Aug 12 #Python
Python一键查找iOS项目中未使用的图片、音频、视频资源
Aug 12 #Python
django+echart数据动态显示的例子
Aug 12 #Python
Flask框架学习笔记之使用Flask实现表单开发详解
Aug 12 #Python
Flask框架学习笔记之表单基础介绍与表单提交方式
Aug 12 #Python
python内存管理机制原理详解
Aug 12 #Python
Flask框架学习笔记之路由和反向路由详解【图文与实例】
Aug 12 #Python
You might like
PHP代码网站如何防范SQL注入漏洞攻击建议分享
2012/03/01 PHP
PHP对象Object的概念 介绍
2012/06/14 PHP
基于header的一些常用指令详解
2013/06/06 PHP
PHP不用递归遍历目录下所有文件的代码
2014/07/04 PHP
php几个预定义变量$_SERVER用法小结
2014/11/07 PHP
php让json_encode不自动转义斜杠“/”的方法
2020/04/27 PHP
JavaScript 模拟用户单击事件
2009/12/31 Javascript
jQuery动画特效实例教程
2014/08/29 Javascript
Javascript实现可旋转的圆圈实例代码
2015/08/04 Javascript
jQuery中借助deferred来请求及判断AJAX加载的实例讲解
2016/05/24 Javascript
Bootstrap学习笔记之环境配置(1)
2016/12/07 Javascript
微信小程序中实现一对多发消息详解及实例代码
2017/02/14 Javascript
JavaScript设计模式之构造函数模式实例教程
2018/07/02 Javascript
vue中$set的使用(结合在实际应用中遇到的坑)
2018/07/10 Javascript
基于vue.js中关于下拉框的值默认及绑定问题
2018/08/22 Javascript
jquery图片预览插件实现方法详解
2019/07/18 jQuery
Vue项目接入Paypal实现示例详解
2020/06/04 Javascript
[48:27]EG vs Liquid 2018国际邀请赛淘汰赛BO3 第二场 8.25
2018/08/29 DOTA
[48:48]完美世界DOTA2联赛PWL S3 Magama vs GXR 第一场 12.19
2020/12/24 DOTA
python 简易计算器程序,代码就几行
2009/08/29 Python
Python实现全局变量的两个解决方法
2014/07/03 Python
Python模块包中__init__.py文件功能分析
2016/06/14 Python
今天 平安夜 Python 送你一顶圣诞帽 @微信官方
2017/12/25 Python
解决pandas 作图无法显示中文的问题
2018/05/24 Python
Python多进程原理与用法分析
2018/08/21 Python
深入了解python中元类的相关知识
2019/08/29 Python
Python+redis通过限流保护高并发系统
2020/04/15 Python
python 两种方法修改文件的创建时间、修改时间、访问时间
2020/09/26 Python
CSS3中线性颜色渐变的一些实现方法
2015/07/14 HTML / CSS
美国正宗奢华复古手袋、珠宝及配饰网站:What Goes Around Comes Around
2018/07/21 全球购物
FC-Moto美国:欧洲最大的摩托车服装和头盔商店之一
2019/08/24 全球购物
《新型玻璃》教学反思
2014/04/13 职场文书
奥巴马开学演讲稿
2014/05/15 职场文书
护士自荐信范文(2016推荐篇)
2016/01/28 职场文书
浅谈MySql整型索引和字符串索引失效或隐式转换问题
2021/11/20 MySQL
nginx中封禁ip和允许内网ip访问的实现示例
2022/03/17 Servers