编程 Python

python自动从arxiv下载paper的示例代码

Posted in Python on December 05, 2020

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time  : 2020/02/11 21:44
# @Author : dangxusheng
# @Email  : dangxusheng163@163.com
# @File  : download_by_href.py
'''
自动从arxiv.org 下载文献
'''

import os
import os.path as osp
import requests
from lxml import etree
from pprint import pprint
import re
import time
import glob

# HTTP headers sent with the arXiv search request (see search()).
headers = {
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
  "Host": 'arxiv.org'
}

# Base URL tried first for every PDF (Chinese arXiv mirror).
HREF_CN = 'http://cn.arxiv.org/pdf/'
# Fallback base URL used when the mirror download fails. It must differ from
# HREF_CN, otherwise the "retry from the source site" step simply repeats the
# request that just failed.
HREF_SRC = 'https://arxiv.org/pdf/'
# Directory that receives the downloaded PDFs; created at import time.
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)

# Search results whose PDF could not be downloaded from either base URL.
FAIL_URLS = []
# Text file the failed hrefs are appended to at the end of a run.
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'


def download(url, title, timeout=60):
  """Download the PDF at ``url`` into SAVE_PATH, named after ``title``.

  Characters that are not allowed in file names are replaced by a space.
  The body is streamed into a temporary ``.part`` file and only moved to its
  final name once the transfer finished, so an interrupted download can never
  be mistaken for a complete PDF on a later run.

  Args:
    url: Address of the PDF.
    title: Paper title, used (after sanitising) as the file name.
    timeout: Seconds to wait for the connection and for each chunk of data.
      This is not a limit on the whole transfer, so large files still work.

  Returns:
    True if a file larger than 50 KB already exists for this title, or if the
    freshly downloaded file is at least 10 KB; False otherwise (including on
    any network or file-system error, which is printed and swallowed).
  """
  pattern = r'[\\/:*?"\'<>|\r\n]+'
  new_title = re.sub(pattern, " ", title)
  print(f'new title: {new_title}')
  save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
  # NOTE(review): the "already downloaded" threshold (50 KB) is stricter than
  # the success threshold below (10 KB); files in between are fetched again.
  if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
    print('this pdf is be existed.')
    return True

  tmp_filepath = save_filepath + '.part'
  try:
    # Stream the body in small chunks instead of loading it into memory.
    with requests.get(url, stream=True, timeout=timeout) as r:
      # Reject 4xx/5xx responses so an HTML error page is not saved as a PDF.
      r.raise_for_status()
      with open(tmp_filepath, 'wb') as file:
        for chunk in r.iter_content(2048):
          file.write(chunk)
    os.replace(tmp_filepath, save_filepath)
    if osp.getsize(save_filepath) >= 10 * 1024:
      print('%s 下载成功.' % title)
      return True
  except Exception as e:
    # Best effort: one failing paper must not abort the whole batch run.
    print(e)
    if osp.exists(tmp_filepath):
      try:
        os.remove(tmp_filepath)
      except OSError:
        pass  # a leftover .part file is harmless; it is never treated as a PDF
  return False


# 从arxiv.org 去下载
def search(start_size=0, title_keywords='Facial Expression'):
  """Query the arXiv advanced search for papers whose title matches.

  Args:
    start_size: Offset of the first result to return (sent as ``start``).
    title_keywords: Text searched for in the title field.

  Returns:
    A tuple ``(info_list, total)``. ``info_list`` holds one
    ``{'title': ..., 'href': ...}`` dict per result on the requested page;
    ``total`` is the result count parsed from the page heading. When the
    heading contains no number, ``([], 0)`` is returned.
  """
  # Access URL: https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
  req_url = 'https://arxiv.org/search/advanced'
  req_data = {
    'advanced': 1,
    'terms-0-operator': 'AND',
    'terms-0-term': title_keywords,
    'terms-0-field': 'title',
    'classification-computer_science': 'y',
    'classification-physics_archives': 'all',
    'classification-include_cross_list': 'include',
    'date-filter_by': 'date_range', # date_range | specific_year
    # 'date-year': DOWN_YEAR,
    'date-year': '',
    'date-from_date': '2015',
    'date-to_date': '2020',
    'date-date_type': 'announced_date_first', # submitted_date | submitted_date_first | announced_date_first
    'abstracts': 'show',
    'size': 50,
    'order': '-announced_date_first',
    'start': start_size,
  }
  res = requests.get(req_url, params=req_data, headers=headers)
  html = etree.HTML(res.content.decode())

  total_text = ''.join(html.xpath('//h1[@class="title is-clearfix"]/text()'))
  total_text = total_text.replace('\n', '').strip(' ')
  # e.g. "Showing 1-50 of 1,355 results": accept thousands separators so that
  # the last number is the full total rather than only its trailing digits.
  num = re.findall(r'\d[\d,]*', total_text)
  # "Sorry, your query returned no results" contains no number at all.
  if not num:
    return [], 0

  total = int(num[-1].replace(',', ''))  # total number of matching papers
  paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
  info_list = []
  for p in paper_list:
    title = ''.join(p.xpath('./p[@class="title is-5 mathjax"]//text()'))
    title = title.replace('\n', '').strip(' ')
    hrefs = p.xpath('./div/p/a/@href')
    if not hrefs:
      # An entry without a link cannot be downloaded; skip it instead of
      # aborting the whole page with an IndexError.
      continue
    info_list.append({'title': title, 'href': hrefs[0]})

  return info_list, total


# 去指定页面下载
def search_special():
  """Collect paper links from the list items of a fixed Gitee page.

  Returns:
    A list of ``{'title': ..., 'href': ...}`` dicts, one per ``<li>`` inside
    the rendered markdown body that has a direct ``<a>`` child. The list is
    also pretty-printed before being returned.
  """
  res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search')
  html = etree.HTML(res.content.decode())

  info_list = []
  for item in html.xpath('//div[@class="file_content markdown-body"]//li'):
    title = ''.join(item.xpath('.//text()')).replace('\n', '').strip(' ')
    hrefs = item.xpath('./a/@href')
    if not hrefs:
      # List items without a direct link carry nothing to download; skipping
      # them avoids an IndexError that would discard every other entry.
      continue
    info_list.append({'title': title, 'href': hrefs[0]})

  pprint(info_list)
  return info_list


if __name__ == '__main__':
  # Page through the search results, download every paper (mirror first,
  # then the fallback URL) and record the ones that could not be fetched.
  page_size = 50  # must match the 'size' value sent by search()
  page_idx = 0
  total = 1000  # placeholder until the first response reports the real count
  keywords = 'Facial Action Unit'
  # `page_idx * page_size < total` visits exactly ceil(total / page_size)
  # pages, including the final partial one, so no separate "last part" pass
  # is needed and no offset beyond the end is ever requested.
  while page_idx * page_size < total:
    paper_list, total = search(page_idx * page_size, keywords)
    print(f'total: {total}')
    if total == 0:
      print('no found .')
      # Leave the loop instead of exiting so that failures collected so far
      # are still written to FAIL_URLS_TXT below.
      break

    for p in paper_list:
      title = p['title']
      paper_id = p['href'].split('/')[-1]
      href = HREF_CN + paper_id + '.pdf'
      print(href)
      if not download(href, title):
        print('从国内镜像下载失败，从源地址开始下载 >>>>')
        # Retry once using the fallback base URL.
        href = HREF_SRC + paper_id + '.pdf'
        if not download(href, title):
          FAIL_URLS.append(p)
      time.sleep(1)  # throttle requests to the server
    page_idx += 1

  pprint(FAIL_URLS)
  with open(FAIL_URLS_TXT, 'a+') as f:
    for item in FAIL_URLS:
      f.write(item['href'] + '\n')

  print('done.')

以上就是python自动从arxiv下载paper的示例代码的详细内容，更多关于python 从arxiv下载paper的资料请关注三水点靠木其它相关文章！

- Author -

dangxusheng

声明：登载此文出于传递更多信息之目的，并不意味着赞同其观点或证实其描述。

Python 相关文章推荐

Python 字符串操作实现代码(截取/替换/查找/分割)

Jun 08 Python

浅要分析Python程序与C程序的结合使用

Apr 07 Python

Python基础知识_浅谈用户交互

May 31 Python

python 实现将字典dict、列表list中的中文正常显示方法

Jul 06 Python

浅谈django rest jwt vue 跨域问题

Oct 26 Python

Python基础之文件读取的讲解

Feb 16 Python

Python中numpy模块常见用法demo实例小结

Mar 16 Python

详解Python的三种可变参数

May 08 Python

python实现简单五子棋游戏

Jun 18 Python

Python开发之pip安装及使用方法详解

Feb 21 Python

如何在Django中使用聚合的实现示例

Mar 23 Python

快速解释如何使用pandas的inplace参数的使用

Jul 23 Python

python使用dlib进行人脸检测和关键点的示例

Dec 05 #Python

python从ftp获取文件并下载到本地

Dec 05 #Python

python基于socket模拟实现ssh远程执行命令

Dec 05 #Python

Python实现PS滤镜中的USM锐化效果

Dec 04 #Python

python 模拟登陆github的示例

Dec 04 #Python

python中round函数保留两位小数的方法

Dec 04 #Python

python中pow函数用法及功能说明

Dec 04 #Python

You might like

玩转虚拟域名◎＋ .

2006/10/09 PHP

PHP实现QQ登录实例代码

2016/01/14 PHP

PHP面向对象程序设计之多态性的应用示例

2018/12/19 PHP

精通JavaScript 纠正 cleanWhitespace函数

2010/03/11 Javascript

JavaScript实现的类字典插入或更新方法实例

2015/07/10 Javascript

Javascript使用post方法提交数据实例

2015/08/03 Javascript

JS 动态加载js文件和css文件同步/异步的两种简单方式

2016/09/23 Javascript

JavaScript页面实时显示当前时间实例代码

2016/10/23 Javascript

3分钟读懂移动端rem使用方法(推荐)

2019/05/06 Javascript

jQuery 图片查看器插件 Viewer.js用法简单示例

2020/04/04 jQuery

vue2.x 对象劫持的原理实现

2020/04/19 Javascript

React中Ref 的使用方法详解

2020/04/28 Javascript

python双向链表实现实例代码

2013/11/21 Python

教你用python3根据关键词爬取百度百科的内容

2016/08/18 Python

python交互式图形编程实例（三）

2017/11/17 Python

详解python函数传参是传值还是传引用

2018/01/16 Python

Python多进程并发与多线程并发编程实例总结

2018/02/08 Python

Win8下python3.5.1安装教程

2020/07/29 Python

Python实现的读取/更改/写入xml文件操作示例

2018/08/30 Python

django rest framework 实现用户登录认证详解

2019/07/29 Python

python_array[0][0]与array[0,0]的区别详解

2020/02/18 Python

解决TensorFlow调用Keras库函数存在的问题

2020/07/06 Python

python 将Excel转Word的示例

2021/03/02 Python

CSS超出文本指定宽度用省略号代替和文本不换行

2016/05/05 HTML / CSS

美国最灵活的移动提供商：Tello

2017/07/18 全球购物

世嘉游戏英国官方商店：SEGA Shop UK

2019/09/20 全球购物

linux面试题参考答案（9）

2015/01/07 面试题

银行职业规划书范文

2013/12/28 职场文书

双十佳事迹材料

2014/01/29 职场文书

灰雀教学反思

2014/04/28 职场文书

法语专业求职信

2014/07/20 职场文书

最美家庭活动方案

2014/08/31 职场文书

小学生作文评语集锦

2014/12/25 职场文书

2015年安康杯竞赛活动总结

2015/03/26 职场文书

立项申请报告范本

2015/05/15 职场文书

2016年三严三实党课学习心得体会

2016/01/06 职场文书