Skip to content

Commit 9d76bd7

Browse files
committed
add arxiv.py
1 parent 857c6b5 commit 9d76bd7

File tree

3 files changed

+190
-0
lines changed

3 files changed

+190
-0
lines changed

Python+arXiv/README.md

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Usage
2+
3+
1. run
4+
5+
```shell
6+
python arxiv.py
7+
```
8+
9+
2. Enter the arXiv ID (e.g. `2011.13126`) at the prompt to get the result, for example:
10+
11+
![image-20220419004604986](https://raw.githubusercontent.com/yzy1996/Image-Hosting/master/image-20220419004604986.png)
12+
13+
3. Copy the printed text and paste it (Shift+paste, i.e. as plain text) into your Markdown file.
14+
15+
[Lifting 2D StyleGAN for 3D-Aware Face Generation](https://arxiv.org/abs/2011.13126)
16+
*Yichun Shi, Divyansh Aggarwal, Anil K. Jain*
17+
**[`CVPR 2021`] (``)**
18+
19+
4. modify the information as you wish.

Python+arXiv/arxiv.py

+162
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
from PyPDF2 import PdfFileReader
2+
from urllib import request
3+
import re
4+
from soupsieve import match
5+
6+
from tqdm import trange
7+
8+
from pathlib import Path
9+
10+
'''
11+
pipeline: take a paper title in any format (or an arXiv id) and produce a note in the standard template style
12+
'''
13+
14+
class Information:
    """Metadata of a single arXiv paper, fetched from the arXiv export API.

    Constructing an instance performs a network request (by arXiv id or by
    free-text title) and parses the first ``<entry>`` of the Atom response
    with regular expressions.

    Attributes set by the constructor:
        query_url: the API URL that was requested.
        strInf: the raw response text.
        id_version: arXiv id including the version suffix (``2011.13126v2``).
        id: arXiv id without the version suffix (``2011.13126``).
        title: paper title with wrapped lines joined and entities decoded.
        title_sub: ``title`` with punctuation removed (used in file names).
        authors: list of author names.
        year: four-digit year of first publication, as a string.
        publish: venue string; empty until ``_get_publish`` runs.
        affiliation: affiliation guess; empty until ``_get_affiliation`` runs.
        abs_url, pdf_url: links to the abstract page and to the PDF.
    """

    # File with one conference name per line. It is looked up in the current
    # working directory first and then next to this module.
    CONF_LIST_NAME = 'conf_list.txt'

    def __init__(self, query_id=None, query_title=None) -> None:
        """Query the arXiv API and parse the answer.

        Args:
            query_id: arXiv identifier such as ``'2103.13413'``; takes
                precedence over ``query_title``.
            query_title: free-text title used when no id is given.

        Raises:
            ValueError: if neither ``query_id`` nor ``query_title`` is given.
            IndexError: if the response contains no usable entry.
        """
        # Local import: URL quoting is only needed here.
        from urllib.parse import quote, quote_plus

        if query_id is not None:
            # quote() keeps '/' so old-style ids (hep-th/9901001) stay intact.
            safe_id = quote(str(query_id).strip())
            self.query_url = f'http://export.arxiv.org/api/query?id_list={safe_id}'
        elif query_title is not None:
            # quote_plus() turns spaces into '+' and escapes '&', '#', ...
            safe_title = quote_plus(query_title)
            self.query_url = f'https://export.arxiv.org/api/query?search_query=all:{safe_title}&max_results=1'
        else:
            raise ValueError('either query_id or query_title is required')

        # Close the HTTP response deterministically.
        with request.urlopen(self.query_url) as response:
            self.strInf = response.read().decode('utf-8')

        self._re_process()

    # Regular-expression based parsing of the response.
    def _re_process(self):
        """Extract id, title, authors and year from ``self.strInf``.

        Raises:
            IndexError: if the response has no ``<entry>`` or the entry lacks
                one of the required fields (e.g. for an unknown id).
        """
        from html import unescape

        # Only look inside the first entry so that feed-level elements
        # (<title>, <id>) can never be picked up by mistake.
        entry = re.search(r'<entry>([\s\S]*?)</entry>', self.strInf)
        if entry is None:
            raise IndexError('the arXiv response contains no entry; check the id or title')
        entry_xml = entry.group(1)

        def first(pattern, what):
            found = re.search(pattern, entry_xml)
            if found is None:
                raise IndexError(f'the arXiv entry has no {what}; check the id or title')
            return found.group(1)

        id_version = first(r'<id>https?://arxiv\.org/abs/(.*?)</id>', 'id')
        # Strip the whole version suffix; versions may have several digits.
        arxiv_id = re.sub(r'v\d+$', '', id_version)

        # Long titles are wrapped over several lines; join them with single
        # spaces and decode XML entities such as &amp;.
        title = ' '.join(unescape(first(r'<title>([\s\S]*?)</title>', 'title')).split())
        title_sub = re.sub(r'[^\w\s-]', '', title)  # drop punctuation

        # Do not require </author> right after </name>: an author element may
        # also contain an affiliation child.
        authors = [unescape(name)
                   for name in re.findall(r'<author>\s*<name>(.*?)</name>', entry_xml)]

        year = first(r'<published>(\d{4}).*</published>', 'publication date')

        self.id_version = id_version
        self.id = arxiv_id
        self.title = title
        self.title_sub = title_sub
        self.authors = authors
        self.year = year
        self.publish = ''
        self.affiliation = ''

        self.abs_url = f'https://arxiv.org/abs/{self.id}'
        self.pdf_url = f'https://arxiv.org/pdf/{self.id}'

    def _load_conferences(self):
        """Return the non-empty lines of the conference list file, or ``[]``."""
        candidates = [Path(self.CONF_LIST_NAME)]
        module_file = globals().get('__file__')
        if module_file:
            candidates.append(Path(module_file).with_name(self.CONF_LIST_NAME))

        for candidate in candidates:
            if candidate.is_file():
                with candidate.open(encoding='utf-8') as f:
                    return [line.strip() for line in f if line.strip()]
        return []

    @staticmethod
    def _find_venue(comment, conferences):
        """Return the first ``<conference> ... <year>`` mention in *comment*.

        A conference name directly followed by the year is normalised with a
        space (``CVPR2020`` -> ``CVPR 2020``). Returns ``''`` when nothing
        matches or when *conferences* is empty.
        """
        names = [re.escape(name) for name in conferences if name]
        if not names:
            # An empty alternation would match any text; treat as "no venue".
            return ''

        alternatives = '|'.join(names)
        # Comments may be wrapped; join the lines so a name and its year that
        # were split across a line break are still found together.
        flat = re.sub(r'\s*\n\s*', ' ', comment)
        found = re.search(rf'({alternatives})(.*?)(\d{{4}})', flat)
        if found is None:
            return ''

        name, gap, year = found.groups()
        return f'{name} {year}' if gap == '' else found.group(0)

    def _get_publish(self):
        """Set ``self.publish`` from the arXiv comment, else ``'arXiv <year>'``.

        The conference names come from ``conf_list.txt``; when that file is
        missing the fallback value is used.
        """
        # obtain from arXiv comments
        comment = re.search(r'<arxiv:comment[^>]*>([\s\S]*?)</arxiv:comment>', self.strInf)

        venue = ''
        if comment is not None:
            venue = self._find_venue(comment.group(1), self._load_conferences())

        # Fallback; a wider web search could be plugged in here in the future.
        self.publish = venue or 'arXiv ' + self.year

    def _get_affiliation(self):
        """Guess the affiliation from a previously downloaded PDF, if present.

        Takes the token that follows the first author's name (spaces removed)
        on the first page. ``self.affiliation`` is left unchanged when the PDF
        does not exist or the name cannot be located.
        """
        # obtain from pdf file, but only if it has been downloaded already
        pdf_file = Path(f'{self.year}_{self.title_sub}.pdf')
        if not pdf_file.exists() or not self.authors:
            return

        # NOTE(review): PdfFileReader/getPage/extractText look like the legacy
        # PyPDF2 API - confirm the installed PyPDF2 version still provides it.
        with pdf_file.open('rb') as f:
            pdf = PdfFileReader(f)
            tokens = pdf.getPage(0).extractText().split()

        first_author = self.authors[0].replace(' ', '')
        try:
            position = tokens.index(first_author)
        except ValueError:
            return  # author name not found as a single token
        if position + 1 < len(tokens):
            self.affiliation = tokens[position + 1]

    def write_notes(self):
        """Print the markdown note (title link, authors, venue/affiliation)."""
        self._get_publish()
        self._get_affiliation()

        # assemble the three output lines
        title_url = f'[{self.title}]({self.abs_url}) '
        authors = ', '.join(self.authors)
        authors = f'*{authors}* '

        publish = f'**[`{self.publish}`] (`{self.affiliation}`)** '

        print(title_url)
        print(authors)
        print(publish)

    # download pdf from the web
    def download(self):
        """Download the PDF to ``<year>_<title_sub>.pdf`` in the working directory."""
        request.urlretrieve(self.pdf_url, f'{self.year}_{self.title_sub}.pdf')
118+
119+
120+
def verify_local_version(filename='11.pdf'):
    """Download the latest version of a local arXiv PDF if it is outdated.

    Reads the versioned arXiv id from the first page of *filename*, queries
    arXiv for the same paper and, when the versioned ids differ, prints a
    notice and downloads the current PDF via ``Information.download``.

    Args:
        filename: path of the local PDF file to check.
    """
    with open(filename, 'rb') as f:
        pdf = PdfFileReader(f)
        tokens = pdf.getPage(0).extractText().split()

    # Find the local version stamp, e.g. "arXiv:2011.13126v2".
    stamps = [token[len('arXiv:'):] for token in tokens
              if re.fullmatch(r'arXiv:\S+v\d+', token)]
    if stamps:
        id_version_local = stamps[-1]
    else:
        # Original positional heuristic: fifth token from the end with its
        # first six characters removed (presumably the "arXiv:" prefix).
        id_version_local = tokens[-5][6:]

    # Strip the whole version suffix; versions may have several digits.
    arxiv_id = re.sub(r'v\d+$', '', id_version_local)

    information = Information(query_id=arxiv_id)

    if information.id_version != id_version_local:
        print('>>>Downloading the latest version!!!')
        information.download()
137+
138+
139+
if __name__ == "__main__":

    # Alternative entry points, kept for reference:
    #
    #   query with title
    #     input_title = 'Image-to-Image Translation with Conditional Adversarial Networks'
    #     information = Information(query_title=input_title)
    #     information.write_notes()
    #
    #   query with id
    #     information = Information(query_id='2103.13413')
    #     information.write_notes()
    #
    #   check a local pdf against the latest arXiv version
    #     verify_local_version()

    # New-style ids look like 2103.13413 (optionally with a vN suffix);
    # old-style ids look like hep-th/9901001.
    id_pattern = re.compile(r'(\d{4}\.\d{4,5}|[a-z-]+(\.[A-Z]{2})?/\d{7})(v\d+)?')

    while True:
        try:
            arxiv_id = input("type id: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C ends the session without a traceback.
            print()
            break

        # Validate the id first (pattern comes first in the re API).
        if not id_pattern.fullmatch(arxiv_id):
            print(f'invalid arXiv id: {arxiv_id!r} (expected something like 2103.13413)')
            continue

        try:
            information = Information(query_id=arxiv_id)
        except (IndexError, OSError) as err:
            # IndexError: no matching entry in the response; OSError: network
            # failure (urllib's URLError derives from OSError).
            print(f'could not fetch {arxiv_id}: {err}')
            continue

        information.write_notes()

Python+arXiv/conf_list.txt

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
ICML
2+
NeurIPS
3+
ICLR
4+
CVPR
5+
ICCV
6+
ECCV
7+
AAAI
8+
IJCAI
9+
3DV

0 commit comments

Comments
 (0)