From b32e38c2cd4a7ec5ae84a3fcf33d4fb5d6335b76 Mon Sep 17 00:00:00 2001 From: natsuapo Date: Wed, 18 May 2022 18:23:39 +0900 Subject: [PATCH 1/3] add using tag to find infobox --- requirements.txt | 1 + wptools/page.py | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e223e05..85d7829 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ certifi>=2017.7.27.1 html2text>=2016.9.19 lxml>=3.8.0 +beautifulsoup4 \ No newline at end of file diff --git a/wptools/page.py b/wptools/page.py index ea86cef..7ec2649 100644 --- a/wptools/page.py +++ b/wptools/page.py @@ -64,6 +64,10 @@ def __init__(self, *args, **kwargs): if boxterm: self.params.update({'boxterm': boxterm}) + boxtag = kwargs.get('boxtag') + if boxtag: + self.params.update({'boxtag':boxtag}) + endpoint = kwargs.get('endpoint') if endpoint: self.params.update({'endpoint': endpoint}) @@ -250,11 +254,15 @@ def _set_parse_data(self): parsetree = pdata.get('parsetree') self.data['parsetree'] = parsetree - boxterm = self.params.get('boxterm') - if boxterm: - infobox = utils.get_infobox(parsetree, boxterm) + boxtag = self.params.get('boxtag') + if boxtag: + infobox = utils.get_infobox_withtag(pdata['text'],parsetree,boxtag) else: - infobox = utils.get_infobox(parsetree) + boxterm = self.params.get('boxterm') + if boxterm: + infobox = utils.get_infobox(parsetree, boxterm) + else: + infobox = utils.get_infobox(parsetree) self.data['infobox'] = infobox title = pdata.get('title') From 904e04b173b64a08fe25de5d0fd8efc394383eaf Mon Sep 17 00:00:00 2001 From: natsuapo Date: Thu, 19 May 2022 15:03:21 +0900 Subject: [PATCH 2/3] add test for info with tag --- tests/test_advanced.py | 6 ++++++ tests/test_basic.py | 1 + wptools/utils.py | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 45 insertions(+) diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 86193c0..164bb69 100755 --- a/tests/test_advanced.py +++ 
b/tests/test_advanced.py @@ -99,6 +99,12 @@ def test_not_found(self): except LookupError as detail: pass + def test_finding_infobox_with_tag(self) : + page = wptools.page("秋田県民会館", lang='ja', boxtag='infobox') + page.get_parse(show=False) + infobox = page.data['infobox'] + self.assertTrue(infobox is not None) + def test_lookup_unicode_error(self): """ Raise LookupError without UnicodeDecodeError. Issue #29 diff --git a/tests/test_basic.py b/tests/test_basic.py index 733a463..ca08636 100755 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -424,6 +424,7 @@ def test_page_get_parse_66(self): self.assertEqual(len(infobox['Genre'].split(' Date: Fri, 20 May 2022 11:54:12 +0900 Subject: [PATCH 3/3] enhance find infoboxes with tag --- wptools/page.py | 2 +- wptools/utils.py | 56 ++++++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/wptools/page.py b/wptools/page.py index 7ec2649..64175d0 100644 --- a/wptools/page.py +++ b/wptools/page.py @@ -256,7 +256,7 @@ def _set_parse_data(self): boxtag = self.params.get('boxtag') if boxtag: - infobox = utils.get_infobox_withtag(pdata['text'],parsetree,boxtag) + infobox = utils.get_infoboxes_withtag(pdata['text'], parsetree, boxtag) else: boxterm = self.params.get('boxterm') if boxterm: diff --git a/wptools/utils.py b/wptools/utils.py index d0c440d..fa59ed4 100644 --- a/wptools/utils.py +++ b/wptools/utils.py @@ -21,40 +21,54 @@ from bs4 import BeautifulSoup -def get_infobox_withtag(pagetxt, ptree, boxtag='infobox') : +def get_boxterm_with_boxtag(pagetxt,ptree,boxtag='infobox'): soup = BeautifulSoup(pagetxt) # here not ensure all infobox items are tables ptree_soup = BeautifulSoup(ptree) table_first_string = soup.find('table', class_=boxtag).find('th').text.strip() - boxes = [] - def order_keep_same(text1, text2) : - for i in text1 : - try : - text2 = text2[text2.index(i) + 1 :] - except : - return False - return True - # for ptree_soup.findAll('template') +def 
order_keep_same(text1, text2) : + for i in text1 : + try : + text2 = text2[text2.index(i) + 1 :] + except : + return False + return True + +def get_infoboxes_withtag(pagetxt, ptree, boxtag='infobox', multi=True) : + soup = BeautifulSoup(pagetxt) + # note: not all infobox items are guaranteed to be tables; some items still cannot be acquired. + ptree_soup = BeautifulSoup(ptree) + + table_first_string = soup.find('table', class_=boxtag).find('th').text.strip() + + boxes = [] + + # this function will get all infoboxes and match them to all templates, distinguishing them by box title for temp_item in ptree_soup.findAll('template') : - if order_keep_same(table_first_string, temp_item.find('value').text.strip()) : + try: + if order_keep_same(table_first_string, temp_item.text.strip()) : - title = temp_item.find('title').text - item = lxml.etree.fromstring(str(temp_item)) - box = template_to_dict(item) + title = temp_item.find('title').text + item = lxml.etree.fromstring(str(temp_item)) + box = template_to_dict(item) - if box : - return box + if box : + if not multi: + return box - alt = template_to_dict_alt(item, title) - if alt : - boxes.append(alt) + alt = template_to_dict_alt(item, title) + if alt : + boxes.append(alt) + + except: + pass - if boxes : - return {'boxes' : boxes, 'count' : len(boxes)} + if boxes : + return {'boxes' : boxes, 'count' : len(boxes)} def get_infobox(ptree, boxterm="box"):