
Extracting Data from the CCL Bilingual Corpus [Simple Web Scraping]

Note: This program is non-commercial; it is just a simple scraper I wrote while extracting data for my own use. For details, please see the CCL corpus usage instructions. Thanks to the creators and maintainers of the CCL corpus. Please credit the source when reposting.

Scope: the CCL bilingual corpus

Input: the search word query, plus max_left and max_right, the maximum number of characters kept on each side of it

Output: None (a ./CCL_corpus/ folder is created under the current directory, holding the search results for each target word)
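
For example, a single call could look like this (a minimal sketch; extract_sentences is the function defined in the full program below):

extract_sentences('bill', max_left=300, max_right=300)
# appends every hit for "bill" to ./CCL_corpus/ccl_bill.txt,
# keeping at most 300 characters of context on each side of the word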


The full program:

import os
import re

import requests


def extract_sentences(query: str = 'bill', max_left: int = 300, max_right: int = 300):
    """Fetch every CCL search result for `query` and append it to a text file.

    :param query: target search word
    :param max_left: maximum number of characters kept before the target word
    :param max_right: maximum number of characters kept after the target word
    :return: None (writes the hits to ./CCL_corpus/ccl_<query>.txt)
    """
    # Each hit is one table row: an index cell, then the two 45%-wide
    # cells holding the aligned halves of the sentence pair.
    row_pattern = re.compile(
        r'<td width="3%">(\d+)</td>'
        r'<td width="45%" valign="top" colspan="3" align="left">(.+?)</td>'
        r'<td width="45%" valign="top" colspan="3" align="left">(.+?)</td></tr>',
        re.S)

    os.makedirs('./CCL_corpus', exist_ok=True)

    page = 1
    while True:
        web = ('http://ccl.pku.edu.cn:8080/ccl_corpus/search?dir=chen'
               '&q=' + query + '&LastQuery=' + query +
               '&start=' + str((page - 1) * 50) + '&num=50'
               '&index=FullIndex&outputFormat=HTML&orderStyle=score'
               '&encoding=UTF-8&neighborSortLength=0'
               '&maxLeftLength=' + str(max_left) +
               '&maxRightLength=' + str(max_right) + '&isForReading=yes')
        response = requests.get(web)

        if response.status_code != 200:  # stop if the site is unavailable
            print('Request failed with status code', response.status_code)
            break

        url_text = response.content.decode()

        matches = list(row_pattern.finditer(url_text))
        if not matches:  # no result rows on this page: past the last page
            break

        with open('./CCL_corpus/ccl_' + query + '.txt', 'a', encoding='utf-8') as f:
            for m in matches:
                # Collapse newlines and runs of spaces inside each cell.
                left = re.sub(' +', ' ', m.group(2).replace('\n', ' '))
                right = re.sub(' +', ' ', m.group(3).replace('\n', ' '))
                f.write(m.group(1) + ' ' + left + ' ' + right + '\n')

        page += 1


if __name__ == "__main__":
    target_words_list = ['address', 'appreciate', 'beat', 'bill', 'bond',
                         'column', 'cover', 'deliver', 'exploit', 'figure',
                         'perspective', 'platform', 'provision', 'rest']
    for t in target_words_list:
        extract_sentences(t)
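
Each line of the output file holds one hit: the hit number followed by the two aligned halves of the sentence pair, with internal newlines and repeated spaces collapsed.

As a side note, the query string could also be assembled with requests' params argument, which URL-encodes the values automatically. This is an untested sketch under the assumption that the CCL endpoint accepts the same parameters in encoded form; fetch_page is a hypothetical helper, not part of the program above:

import requests

def fetch_page(query: str, page: int, max_left: int = 300, max_right: int = 300):
    # Hypothetical helper: same endpoint and parameters as extract_sentences,
    # but letting requests build and URL-encode the query string.
    params = {
        'dir': 'chen', 'q': query, 'LastQuery': query,
        'start': (page - 1) * 50, 'num': 50,
        'index': 'FullIndex', 'outputFormat': 'HTML',
        'orderStyle': 'score', 'encoding': 'UTF-8',
        'neighborSortLength': 0,
        'maxLeftLength': max_left, 'maxRightLength': max_right,
        'isForReading': 'yes',
    }
    return requests.get('http://ccl.pku.edu.cn:8080/ccl_corpus/search', params=params)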