
Extracting Data from the CCL Bilingual Corpus [Simple Web Scraping]

Note: This program is non-commercial; it is just a simple scraper I wrote while extracting data for my own use. For details, please see the CCL corpus usage instructions. Thanks to the creators and maintainers of the CCL corpus. Please credit the source when reposting.

Scope: the CCL bilingual corpus

Input: the search word query, plus max_left and max_right, the maximum number of characters kept on each side of it

Output: None (a ./CCL_corpus/ folder is created under the current directory, holding the search results for each target word)
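
For example, a single call could look like this (a minimal sketch; extract_sentences is the function defined in the full program below):

extract_sentences('bill', max_left=300, max_right=300)
# appends every hit for "bill" to ./CCL_corpus/ccl_bill.txt,
# keeping at most 300 characters of context on each side of the word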


The full program:

import os
import re

import requests


def extract_sentences(query: str = 'bill', max_left: int = 300, max_right: int = 300):
    """Fetch every CCL search result for `query` and append it to a text file.

    :param query: target search word
    :param max_left: maximum number of characters kept before the target word
    :param max_right: maximum number of characters kept after the target word
    :return: None (writes the hits to ./CCL_corpus/ccl_<query>.txt)
    """
    # Each hit is one table row: an index cell, then the two 45%-wide
    # cells holding the aligned halves of the sentence pair.
    row_pattern = re.compile(
        r'<td width="3%">(\d+)</td>'
        r'<td width="45%" valign="top" colspan="3" align="left">(.+?)</td>'
        r'<td width="45%" valign="top" colspan="3" align="left">(.+?)</td></tr>',
        re.S)

    os.makedirs('./CCL_corpus', exist_ok=True)

    page = 1
    while True:
        web = ('http://ccl.pku.edu.cn:8080/ccl_corpus/search?dir=chen'
               '&q=' + query + '&LastQuery=' + query +
               '&start=' + str((page - 1) * 50) + '&num=50'
               '&index=FullIndex&outputFormat=HTML&orderStyle=score'
               '&encoding=UTF-8&neighborSortLength=0'
               '&maxLeftLength=' + str(max_left) +
               '&maxRightLength=' + str(max_right) + '&isForReading=yes')
        response = requests.get(web)

        if response.status_code != 200:  # stop if the site is unavailable
            print('Request failed with status code', response.status_code)
            break

        url_text = response.content.decode()

        matches = list(row_pattern.finditer(url_text))
        if not matches:  # no result rows on this page: past the last page
            break

        with open('./CCL_corpus/ccl_' + query + '.txt', 'a', encoding='utf-8') as f:
            for m in matches:
                # Collapse newlines and runs of spaces inside each cell.
                left = re.sub(' +', ' ', m.group(2).replace('\n', ' '))
                right = re.sub(' +', ' ', m.group(3).replace('\n', ' '))
                f.write(m.group(1) + ' ' + left + ' ' + right + '\n')

        page += 1


if __name__ == "__main__":
    target_words_list = ['address', 'appreciate', 'beat', 'bill', 'bond',
                         'column', 'cover', 'deliver', 'exploit', 'figure',
                         'perspective', 'platform', 'provision', 'rest']
    for t in target_words_list:
        extract_sentences(t)
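
Each line of the output file holds one hit: the hit number followed by the two aligned halves of the sentence pair, with internal newlines and repeated spaces collapsed.

As a side note, the query string could also be assembled with requests' params argument, which URL-encodes the values automatically. This is an untested sketch under the assumption that the CCL endpoint accepts the same parameters in encoded form; fetch_page is a hypothetical helper, not part of the program above:

import requests

def fetch_page(query: str, page: int, max_left: int = 300, max_right: int = 300):
    # Hypothetical helper: same endpoint and parameters as extract_sentences,
    # but letting requests build and URL-encode the query string.
    params = {
        'dir': 'chen', 'q': query, 'LastQuery': query,
        'start': (page - 1) * 50, 'num': 50,
        'index': 'FullIndex', 'outputFormat': 'HTML',
        'orderStyle': 'score', 'encoding': 'UTF-8',
        'neighborSortLength': 0,
        'maxLeftLength': max_left, 'maxRightLength': max_right,
        'isForReading': 'yes',
    }
    return requests.get('http://ccl.pku.edu.cn:8080/ccl_corpus/search', params=params)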