import re

import pandas as pd

# Literal-text pattern: selects every line that contains "Category".
pattern = re.compile('Category')

# The file name and lines=True indicate a gzip-compressed JSON Lines dump.
wiki = pd.read_json('jawiki-country.json.gz', lines=True)
article = wiki.loc[wiki['title'] == 'イギリス', 'text'].values[0]

# pattern.search returns None (falsy) on a miss, so filter() keeps only the
# lines in which the pattern was found.
for row in filter(pattern.search, article.split('\n')):
    print(row)
re.searchがマッチングに失敗した場合にNoneを返却することを利用しています。
22. カテゴリ名の抽出
記事のカテゴリ名を(行単位ではなく名前で)抽出せよ.
1 2 3 4 5 6 7 8 9
import re

import pandas as pd

# Defined here so the snippet runs on its own; previously `pattern` was used
# without being defined in this script, which raised NameError.
pattern = re.compile('Category')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

for line in ls:
    # pattern.search returns None when the line has no "Category" text.
    if pattern.search(line):
        # Post-process with plain replacements: drop the link brackets, the
        # "Category:" prefix and the two trailing markers ('|*', '|元').
        name = (
            line.replace('[[', '')
            .replace('Category:', '')
            .replace(']]', '')
            .replace('|*', '')
            .replace('|元', '')
        )
        print(name)
正規表現で全てを記述せず、後処理としてreplaceで削除を行なっています。
23. セクション構造
記事中に含まれるセクション名とそのレベル(例えば"== セクション名 =="なら1)を表示せよ.
1 2 3 4 5 6 7 8 9 10
import re

import pandas as pd

# A heading line starts with a run of '=' (captured), followed by the title
# (captured, surrounding whitespace excluded) and ends with a run of '='.
# Capturing the parts avoids counting or deleting '=' characters that belong
# to the title itself.
pattern = re.compile(r'^(=+)\s*(.+?)\s*=+\s*$')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

for line in ls:
    heading = pattern.search(line)
    if heading:
        # "== name ==" is level 1, so the level is the marker length minus one.
        level = len(heading.group(1)) - 1
        print(heading.group(2), level)
import re

import pandas as pd

# The prefix alternatives are grouped so that the ':' and the capture apply to
# both "File" and "ファイル". Ungrouped, a bare "File" matched on its own and
# produced an empty capture.
pattern = re.compile(r'(?:File|ファイル):(.+?)\|')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

for line in ls:
    # A single line may reference several files; print every capture.
    for media in pattern.findall(line):
        print(media)
import re

import pandas as pd

# Matches "|name = value" lines. Raw string avoids invalid escape sequences;
# the '^' anchor restricts matches to lines that begin with '|' so that
# "|k = v" fragments in the middle of a line are not collected; '\s+' keeps
# padding before '=' out of the captured name.
pattern = re.compile(r'^\s*\|(.+?)\s+=\s*(.+)')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

# Field name -> raw field value (everything after '=' up to the end of line).
d = {}
for line in ls:
    field = pattern.search(line)
    if field:
        d[field.group(1).strip()] = field.group(2)
print(d)
import re

import pandas as pd

# Matches "|name = value" lines that begin with '|' (see the anchor); '\s+'
# keeps padding before '=' out of the captured name.
pattern = re.compile(r'^\s*\|(.+?)\s+=\s*(.+)')
# Text wrapped in runs of two or more apostrophes (wiki emphasis markup).
p_emp = re.compile(r"'{2,}(.+?)'{2,}")

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

# Field name -> field value with emphasis markup removed.
d = {}
for line in ls:
    field = pattern.search(line)
    if field:
        # Strip the markup from the value *before* storing it; previously the
        # cleaned text was only printed and the dict kept the raw value.
        d[field.group(1).strip()] = p_emp.sub(r'\1', field.group(2))
print(d)
import re

import pandas as pd

# Matches "|name = value" lines that begin with '|'; '\s+' keeps padding
# before '=' out of the captured name.
pattern = re.compile(r'^\s*\|(.+?)\s+=\s*(.+)')
# Text wrapped in runs of two or more apostrophes (wiki emphasis markup).
p_emp = re.compile(r"'{2,}(.+?)'{2,}")
# "[[target]]" or "[[target|label]]": capture only the visible text, i.e. the
# label when a pipe is present, otherwise the target. Links containing more
# than one pipe (e.g. media embeds with options) do not match and are left
# untouched.
p_link = re.compile(r'\[\[(?:[^|\]\n]*\|)?([^|\]\n]+)\]\]')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values

# Clean the whole article text first, then split it into lines.
text = uk[0]
text = p_emp.sub(r'\1', text)
text = p_link.sub(r'\1', text)
ls = text.split('\n')

# Field name -> field value with emphasis and internal-link markup removed.
d = {}
for line in ls:
    field = pattern.search(line)
    if field:
        d[field.group(1).strip()] = field.group(2)
print(d)
import re

import pandas as pd
import requests

# Matches "|name = value" lines that begin with '|'; '\s+' keeps padding
# before '=' out of the captured name.
pattern = re.compile(r'^\s*\|(.+?)\s+=\s*(.+)')

wiki = pd.read_json('jawiki-country.json.gz', lines=True)
uk = wiki[wiki['title'] == 'イギリス'].text.values
ls = uk[0].split('\n')

# Field name -> raw field value.
d = {}
for line in ls:
    field = pattern.search(line)
    if field:
        d[field.group(1).strip()] = field.group(2)

URL = "https://commons.wikimedia.org/w/api.php"
# Ask the API for the URL of the file named in the '国旗画像' field.
# A missing field still raises KeyError, as before.
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "File:" + d['国旗画像'].strip(),
    "prop": "imageinfo",
    "iiprop": "url",
}

# The context manager closes the session; the timeout keeps the script from
# hanging indefinitely, and raise_for_status surfaces HTTP errors instead of
# failing later while decoding the body.
with requests.Session() as session:
    response = session.get(url=URL, params=PARAMS, timeout=10)
    response.raise_for_status()
    data = response.json()

for page in data['query']['pages'].values():
    # Pages without an 'imageinfo' entry are skipped rather than raising.
    info = page.get('imageinfo')
    if info:
        print(info[0]['url'])