From e8d445ace68b5c10a05589b8a6a4c762d42b3544 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 26 Jun 2024 17:26:26 -0400 Subject: [PATCH 01/16] add wiki retriever --- src/agentscope/service/web/wiki.py | 483 +++++++++++++++++++++++++++++ tests/wiki_test.py | 430 +++++++++++++++++++++++++ 2 files changed, 913 insertions(+) create mode 100644 src/agentscope/service/web/wiki.py create mode 100644 tests/wiki_test.py diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py new file mode 100644 index 000000000..24b6198b9 --- /dev/null +++ b/src/agentscope/service/web/wiki.py @@ -0,0 +1,483 @@ +"""Search contents from WikiPedia, including texts, categories, infotable, table,...""" +import requests +import json +from bs4 import BeautifulSoup +import re + +from agentscope.service.service_response import ( + ServiceResponse, + ServiceExecStatus, +) +from agentscope.utils.common import requests_get + +def get_category_members( + entity: str, + max_members: int=1000, + limit_per_request: int=500 + ) -> ServiceResponse: + """Function to retrieve category members from Wikipedia:Category pages + + Args: + entity (str): searching keywords + max_members (int): maximum number of members to output + limit_per_request (int): number of members retrieved per quest + + Returns: + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, and return `content` as a list of dicts. + Keys of each dict: + + "pageid": unique page ID for the member + + "ns": namespace for the member, indicating if the corresponding page is Article/User/... See https://en.wikipedia.org/wiki/Wikipedia:Namespace for details. + + "title": title of the member + + Example: + + .. code-block:: python + members = get_category_members("Machine_learning", max_members=10) + print(members) + + It returns contents: + + .. 
code-block:: python + { + 'status': , + 'content': [{'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'}, + {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'}, + {'pageid': 53587467, 'ns': 0, 'title': 'Outline of machine learning'}, + {'pageid': 64439717, 'ns': 0, 'title': '80 Million Tiny Images'}, + {'pageid': 75530149, 'ns': 0, 'title': 'Accelerated Linear Algebra'}] + + } + + """ + url = "https://en.wikipedia.org/w/api.php" + params = { + 'action': 'query', + 'list': 'categorymembers', + 'cmtitle': f'Category:{entity}', + 'cmlimit': limit_per_request, # Maximum number of results per request + 'format': 'json' + } + + members = [] + total_fetched = 0 + + while total_fetched < max_members: + data = requests_get(url, params=params) + batch_members = data['query']['categorymembers'] + members.extend(batch_members) + total_fetched += len(batch_members) + + # Check if there is a continuation token + if 'continue' in data and total_fetched < max_members: + params['cmcontinue'] = data['continue']['cmcontinue'] + else: + break + + # If more members were fetched than max_members, trim the list + if len(members) > max_members: + members = members[:max_members] + + if len(members) > 0: + return ServiceResponse(ServiceExecStatus.SUCCESS, members) + else: + return ServiceResponse(ServiceExecStatus.ERROR, members) + + +def get_infobox( + entity: str + ) -> ServiceResponse: + """ + Function to retrieve InfoBox from the WikiPedia page + + Args: + entity (str): searching keywords + + Returns: + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, and return `content` as a dict containing information in the InfoBox. + + Example: + + .. code-block:: python + infobox_data = get_infobox(entity="Python (programming language)") + print(infobox_data) + + It returns content: + + .. code-block:: python + { + 'status': , + 'content': {'Paradigm': 'Multi-paradigm : object-oriented , [1] procedural ( imperative ), functional , structured , reflective', + 'Designed\xa0by': 'Guido van Rossum', + 'Developer': 'Python Software Foundation', + 'First\xa0appeared': '20\xa0February 1991 ; 33 years ago ( 1991-02-20 ) [2]', + 'Stable release': '3.12.4 / 6 June 2024 ; 14 days ago ( 6 June 2024 )', + 'Typing discipline': 'duck , dynamic , strong ; [3] optional type annotations (since 3.5, but those hints are ignored, except with unofficial tools) [4]', + 'OS': 'Tier 1 : 64-bit Linux , macOS ; 64- and 32-bit Windows 10+ [5] Tier 2 : E.g. 32-bit WebAssembly (WASI) Tier 3 : 64-bit FreeBSD , iOS ; e.g. Raspberry Pi OS Unofficial (or has been known to work): Other Unix-like / BSD variants and e.g. 
Android 5.0+ (official from Python 3.13 planned [6] ) and a few other platforms [7] [8] [9]', + 'License': 'Python Software Foundation License', + 'Filename extensions': '.py, .pyw, .pyz, [10] .pyi, .pyc, .pyd', + 'Website': 'python.org'} + } + """ + + url = "https://en.wikipedia.org/w/api.php" + + # Step 1: Check if the entity exists + search_params = { + 'action': 'query', + 'list': 'search', + 'srsearch': entity, + 'format': 'json' + } + + search_data = requests_get(url, params=search_params) + + if 'query' in search_data and search_data['query']['search']: + # Check if the exact title exists + exact_match = None + for result in search_data['query']['search']: + if result['title'].lower() == entity.lower(): + exact_match = result['title'] + break + if not exact_match: + similar_entities = [result['title'] for result in search_data['query']['search'][:5]] + return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. Here are similar entities:{similar_entities}") + + entity = exact_match + + # Step 2: Fetch the infobox content if the entity exists + parse_params = { + 'action': 'parse', + 'page': entity, + 'prop': 'text', + 'format': 'json' + } + + parse_data = requests_get(url, params=parse_params) + + if 'parse' in parse_data: + raw_html = parse_data['parse']['text']['*'] + soup = BeautifulSoup(raw_html, 'html.parser') + infobox = soup.find('table', {'class': 'infobox'}) + + if not infobox: + return ServiceResponse(ServiceExecStatus.ERROR, None) + + infobox_data = {} + for row in infobox.find_all('tr'): + header = row.find('th') + value = row.find('td') + if header and value: + key = header.get_text(" ", strip=True) + val = value.get_text(" ", strip=True) + infobox_data[key] = val + + return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data) + else: + error_message = parse_data.get('error', {}).get('info', 'Unknown error occurred') + return ServiceResponse(ServiceExecStatus.ERROR, {"error": error_message}) + else: + return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + + +def get_page_content_by_paragraph( + entity: str, + max_paragraphs: int=1 + ) -> ServiceResponse: + """ + Retrieve content from a Wikipedia page and split it into paragraphs, + excluding section headers. + + Args: + entity (str): search word. + max_paragraphs (int, optional): The maximum number of paragraphs to retrieve. Default is None (retrieve all paragraphs). + + Returns: + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, and return `content` as a list of paragraphs from the Wikipedia page. + + Example: + + .. code-block:: python + wiki_paragraph = get_page_content_by_paragraph(entity="Python (programming language)", max_paragraphs=1) + print(wiki_paragraph) + + It will return content: + .. 
code-block:: python + { + 'status': , + 'content': ['Python is a high-level, general-purpose programming...'] + } + + """ + url = "https://en.wikipedia.org/w/api.php" + + # Step 1: Check if the entity exists + search_params = { + 'action': 'query', + 'list': 'search', + 'srsearch': entity, + 'format': 'json' + } + + search_data = requests_get(url, params=search_params) + + if 'query' in search_data and search_data['query']['search']: + # Check if the exact title exists + exact_match = None + for result in search_data['query']['search']: + if result['title'].lower() == entity.lower(): + exact_match = result['title'] + break + if not exact_match: + similar_entities = [result['title'] for result in search_data['query']['search'][:5]] + return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. Here are similar entities: {similar_entities}") + + entity = exact_match + + # Step 2: Fetch the page content if the entity exists + params = { + 'action': 'query', + 'prop': 'extracts', + 'explaintext': True, + 'titles': entity, + 'format': 'json' + } + + data = requests_get(url, params=params) + page = next(iter(data['query']['pages'].values())) + content = page.get('extract', 'No content found.') + if content == 'No content found.': + return ServiceResponse(ServiceExecStatus.ERROR, content) + + # Split content into paragraphs and filter out headers + paragraphs = [para.strip() for para in content.split('\n\n') if not re.match(r'^\s*==.*==\s*$', para) and para.strip() != ''] + + # Return the specified number of paragraphs + if max_paragraphs: + paragraphs = paragraphs[:max_paragraphs] + + return ServiceResponse(ServiceExecStatus.SUCCESS, paragraphs) + else: + return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + +def get_all_wikipedia_tables( + entity: str + ) -> ServiceResponse: + """ + Retrieve tables on the Wikipedia page + + Args: + entity (str): search word. + + Returns: + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, and return `content` as a list of tables from the Wikipedia page. + Each table is presented as a dict, where key is the column name and value is the values for each column. + + Example: + + .. code-block:: python + wiki_table = get_all_wikipedia_tables(entity="Python (programming language)") + print(wiki_table) + + It will return content: + .. code-block:: python + { + 'status': , + 'content': [ + { + 'Type': ['bool','bytearray','bytes','complex',...], + 'Mutability': ['immutable','mutable','immutable','immutable',...], + ... + } + ] + } + + """ + url = "https://en.wikipedia.org/w/api.php" + + # Step 1: Check if the entity exists + search_params = { + 'action': 'query', + 'list': 'search', + 'srsearch': entity, + 'format': 'json' + } + + search_response = requests_get(url, params=search_params) + search_data = search_response + + if 'query' in search_data and search_data['query']['search']: + # Check if the exact title exists + exact_match = None + for result in search_data['query']['search']: + if result['title'].lower() == entity.lower(): + exact_match = result['title'] + break + if not exact_match: + similar_entities = [result['title'] for result in search_data['query']['search'][:5]] + return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. 
Here are similar entities:{similar_entities}") + + entity = exact_match + + # Step 2: Fetch the page content if the entity exists + params = { + 'action': 'parse', + 'page': entity, + 'prop': 'text', + 'format': 'json' + } + + data = requests_get(url, params=params) + raw_html = data['parse']['text']['*'] + + soup = BeautifulSoup(raw_html, 'html.parser') + tables = soup.find_all('table', {'class': 'wikitable'}) + + if not tables: + return ServiceResponse(ServiceExecStatus.ERROR, None) + + all_tables_data = [] + for table_index, table in enumerate(tables): + headers = [header.get_text(strip=True) for header in table.find_all('th')] + table_dict = {header: [] for header in headers} + + for row in table.find_all('tr')[1:]: # Skip the header row + cells = row.find_all(['td', 'th']) + if len(cells) == len(headers): # Ensure the row has the correct number of cells + for i, cell in enumerate(cells): + table_dict[headers[i]].append(cell.get_text(strip=True)) + + all_tables_data.append(table_dict) + + return ServiceResponse(ServiceExecStatus.SUCCESS, all_tables_data) + else: + return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + + +def get_page_images_with_captions( + entity: str + ) -> ServiceResponse: + """ + Function to retrive images and details on the Wikipedia page + + Args: + entity (str): search word. + + Returns: + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS and return the `content` as a list of dict from the Wikipedia page. + + Each dict has: + 'title': title of the image + 'url': link to the image + 'caption': caption of the image + + Example: + .. code-block:: python + wiki_images = get_page_images_with_captions(entity="Python (programming language)") + print(wiki_images) + + It will return: + + .. code-block:: python + { + 'status': , + 'content': [{ + 'title': 'File:Commons-logo.svg', + 'url': 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg', + 'caption': 'The Wikimedia Commons logo, SVG version.'}, + ... 
+ ] + } + """ + + url = "https://en.wikipedia.org/w/api.php" + + # Step 1: Check if the entity exists + search_params = { + 'action': 'query', + 'list': 'search', + 'srsearch': entity, + 'format': 'json' + } + + search_response = requests_get(url, params=search_params) + search_data = search_response + + if 'query' in search_data and search_data['query']['search']: + # Check if the exact title exists + exact_match = None + for result in search_data['query']['search']: + if result['title'].lower() == entity.lower(): + exact_match = result['title'] + break + if not exact_match: + similar_entities = [result['title'] for result in search_data['query']['search'][:5]] + return ServiceResponse(ServiceExecStatus.ERROR, {"similar_entities": similar_entities}) + + entity = exact_match + + # Step 2: Get the list of images + params = { + 'action': 'query', + 'prop': 'images', + 'titles': entity, + 'format': 'json' + } + data = requests_get(url, params=params) + page = next(iter(data['query']['pages'].values())) + images = page.get('images', []) + if len(images) == 0: + return ServiceResponse(ServiceExecStatus.ERROR, None) + + # Step 3: Get details for each image + image_details = [] + for image in images: + image_title = image['title'] + params = { + 'action': 'query', + 'titles': image_title, + 'prop': 'imageinfo', + 'iiprop': 'url|extmetadata', + 'format': 'json' + } + response = requests.get(url, params=params) + data = response.json() + image_page = next(iter(data['query']['pages'].values())) + if 'imageinfo' in image_page: + image_info = image_page['imageinfo'][0] + image_url = image_info.get('url', '') + extmetadata = image_info.get('extmetadata', {}) + caption = extmetadata.get('ImageDescription', {}).get('value', 'No caption available') + image_details.append({ + 'title': image_title, + 'url': image_url, + 'caption': caption + }) + + return ServiceResponse(ServiceExecStatus.SUCCESS, image_details) + else: + return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + + diff --git a/tests/wiki_test.py b/tests/wiki_test.py new file mode 100644 index 000000000..9481d45c2 --- /dev/null +++ b/tests/wiki_test.py @@ -0,0 +1,430 @@ +"""Wiki retriever test.""" +import unittest +from unittest.mock import Mock, patch, MagicMock + +from agentscope.service import ServiceResponse +from agentscope.service import get_category_members, get_infobox, get_page_content_by_paragraph, get_all_wikipedia_tables, get_page_images_with_captions +from agentscope.service.service_status import ServiceExecStatus + +class TestWiki(unittest.TestCase): + """ExampleTest for a unit test.""" + + @patch("agentscope.utils.common.requests.get") + def test_get_category_members(self, mock_get: MagicMock) -> None: + """Test test_get_category_members""" + mock_response = Mock() + mock_dict = { + 'query': { + 'categorymembers': [{ + 'pageid': 20, + 'ns': 0, + 'title': 'This is a test' + }] + } + } + + expected_result = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=[ + {'pageid': 20, + 'ns': 0, + 'title': 'This is a test' + } + ] + ) + + mock_response.json.return_value = mock_dict + mock_get.return_value = mock_response + + test_entity = "Test" + max_members=1 + limit_per_request=100 + params = { + 'action': 'query', + 'list': 'categorymembers', + 'cmtitle': f'Category:{test_entity}', + 'cmlimit': limit_per_request, # Maximum number of results per request + 'format': 'json' + } + + results = get_category_members( + entity=test_entity, + max_members=max_members, + limit_per_request=limit_per_request + ) 
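+        # Verify exactly one GET was issued to the MediaWiki endpoint,
+        # with the query parameters built above.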
+ mock_get.assert_called_once_with( + "https://en.wikipedia.org/w/api.php", + params=params, + ) + + self.assertEqual( + results, + expected_result, + ) + + @patch("agentscope.utils.common.requests.get") + def test_get_infobox(self, mock_get: MagicMock) -> None: + """Test get_infobox with different parameters and responses""" + + # Mock responses for search query + mock_response_search = Mock() + mock_dict_search = { + 'query': { + 'search': [ + {'title': 'Test'} + ] + } + } + + # Mock responses for parse query + mock_response_parse = Mock() + mock_dict_parse = { + 'parse': { + 'title': 'Test', + 'pageid': 20, + 'text': { '*':""" + + + + + + + + + +
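+<!-- Minimal infobox fixture; the retriever locates it via class="infobox" -->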
+<table class="infobox">
+<tr><th>Column1</th><td>Data1</td></tr>
+<tr><th>Column2</th><td>Data2</td></tr>
+</table>
+ """ + } + } + } + + expected_result = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content={ + 'Column1': 'Data1', + 'Column2': 'Data2' + } + ) + + # Set the side effect of the mock_get to return different responses in sequence + mock_response_search.json.return_value = mock_dict_search + mock_response_parse.json.return_value = mock_dict_parse + mock_get.side_effect = [mock_response_search, mock_response_parse] + + test_entity = "Test" + + results = get_infobox(entity=test_entity) + + # Define expected calls + calls = [ + unittest.mock.call("https://en.wikipedia.org/w/api.php", params={ + 'action': 'query', + 'list': 'search', + 'srsearch': test_entity, + 'format': 'json' + }), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params={ + 'action': 'parse', + 'page': test_entity, + 'prop': 'text', + 'format': 'json' + }) + ] + + mock_get.assert_has_calls(calls, any_order=True) + + self.assertEqual(results, expected_result) + + + @patch("agentscope.utils.common.requests.get") + def test_get_page_content_by_paragraph(self, mock_get: MagicMock) -> None: + """Test get_page_content_by_paragraph with different parameters and responses""" + + # Mock responses for search query + mock_response_search = Mock() + mock_dict_search = { + 'query': { + 'search': [ + {'title': 'Test'} + ] + } + } + + # Mock responses for extract query + mock_response_extract = Mock() + mock_dict_extract = { + 'query': { + 'pages': { + '20': { + 'pageid': 20, + 'title': 'Test', + 'extract': """ + This is the first paragraph. + + This is the second paragraph. + + == Section Header == + + This is the third paragraph under a section header. + """ + } + } + } + } + + expected_result = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=[ + "This is the first paragraph.", + "This is the second paragraph." + ] + ) + + # Set the side effect of the mock_get to return different responses in sequence + mock_response_search.json.return_value = mock_dict_search + mock_response_extract.json.return_value = mock_dict_extract + mock_get.side_effect = [mock_response_search, mock_response_extract] + + test_entity = "Test" + + results = get_page_content_by_paragraph(entity=test_entity, max_paragraphs=2) + + # Define expected calls + params1 = { + 'action': 'query', + 'list': 'search', + 'srsearch': test_entity, + 'format': 'json' + } + params2 = { + 'action': 'query', + 'prop': 'extracts', + 'explaintext': True, + 'titles': test_entity, + 'format': 'json' + } + + calls = [ + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2) + ] + + mock_get.assert_has_calls(calls, any_order=True) + + self.assertEqual(results, expected_result) + + @patch("agentscope.utils.common.requests.get") + def test_get_all_wikipedia_tables(self, mock_get: MagicMock) -> None: + """Test get_all_wikipedia_tables with different parameters and responses""" + + # Mock responses for search query + mock_response_search = Mock() + mock_dict_search = { + 'query': { + 'search': [ + {'title': 'Test'} + ] + } + } + + # Mock responses for parse query + mock_response_parse = Mock() + mock_dict_parse = { + 'parse': { + 'title': 'Test', + 'pageid': 20, + 'text': { '*':""" + + + + + + + + + + + + + +
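+<!-- Minimal wikitable fixture; its rows drive the expected_result below -->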
+<table class="wikitable">
+<tr><th>Header1</th><th>Header2</th></tr>
+<tr><td>Row1Col1</td><td>Row1Col2</td></tr>
+<tr><td>Row2Col1</td><td>Row2Col2</td></tr>
+</table>
+ """ + } + } + } + + expected_result = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=[{ + 'Header1': ['Row1Col1', 'Row2Col1'], + 'Header2': ['Row1Col2', 'Row2Col2'] + }] + ) + + # Set the side effect of the mock_get to return different responses in sequence + mock_response_search.json.return_value = mock_dict_search + mock_response_parse.json.return_value = mock_dict_parse + mock_get.side_effect = [mock_response_search, mock_response_parse] + + test_entity = "Test" + + results = get_all_wikipedia_tables(entity=test_entity) + + # Define expected calls + params1 = { + 'action': 'query', + 'list': 'search', + 'srsearch': test_entity, + 'format': 'json' + } + params2 = { + 'action': 'parse', + 'page': test_entity, + 'prop': 'text', + 'format': 'json' + } + + calls = [ + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2) + ] + + mock_get.assert_has_calls(calls, any_order=True) + + self.assertEqual(results, expected_result) + + + @patch("agentscope.utils.common.requests.get") + def test_get_page_images_with_captions(self, mock_get: MagicMock) -> None: + """Test get_page_images_with_captions with different parameters and responses""" + + # Mock responses for search query + mock_response_search = Mock() + mock_dict_search = { + 'query': { + 'search': [ + {'title': 'Test'} + ] + } + } + + # Mock responses for images query + mock_response_images = Mock() + mock_dict_images = { + 'query': { + 'pages': { + '20': { + 'pageid': 20, + 'title': 'Test', + 'images': [ + {'title': 'Image1'}, + {'title': 'Image2'} + ] + } + } + } + } + + # Mock responses for image details query + mock_response_image1 = Mock() + mock_dict_image1 = { + 'query': { + 'pages': { + '30': { + 'pageid': 30, + 'imageinfo': [{ + 'url': 'http://example.com/image1.jpg', + 'extmetadata': { + 'ImageDescription': {'value': 'Caption for image 1'} + } + }] + } + } + } + } + + mock_response_image2 = Mock() + mock_dict_image2 = { + 'query': { + 'pages': { + '31': { + 'pageid': 31, + 'imageinfo': [{ + 'url': 'http://example.com/image2.jpg', + 'extmetadata': { + 'ImageDescription': {'value': 'Caption for image 2'} + } + }] + } + } + } + } + + expected_result = ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=[ + { + 'title': 'Image1', + 'url': 'http://example.com/image1.jpg', + 'caption': 'Caption for image 1' + }, + { + 'title': 'Image2', + 'url': 'http://example.com/image2.jpg', + 'caption': 'Caption for image 2' + } + ] + ) + + # Set the side effect of the mock_get to return different responses in sequence + mock_response_search.json.return_value = mock_dict_search + mock_response_images.json.return_value = mock_dict_images + mock_response_image1.json.return_value = mock_dict_image1 + mock_response_image2.json.return_value = mock_dict_image2 + mock_get.side_effect = [mock_response_search, mock_response_images, mock_response_image1, mock_response_image2] + + test_entity = "Test" + + results = get_page_images_with_captions(entity=test_entity) + + # Define expected calls + params1 = { + 'action': 'query', + 'list': 'search', + 'srsearch': test_entity, + 'format': 'json' + } + params2 = { + 'action': 'query', + 'prop': 'images', + 'titles': test_entity, + 'format': 'json' + } + params3_image1 = { + 'action': 'query', + 'titles': 'Image1', + 'prop': 'imageinfo', + 'iiprop': 'url|extmetadata', + 'format': 'json' + } + params4_image2 = { + 'action': 'query', + 'titles': 'Image2', + 'prop': 'imageinfo', + 'iiprop': 
'url|extmetadata', + 'format': 'json' + } + + calls = [ + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params3_image1), + unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params4_image2) + ] + + mock_get.assert_has_calls(calls, any_order=True) + + self.assertEqual(results, expected_result) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From fad1ba6994067eb338cbff79336a0175534f6edb Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 26 Jun 2024 17:34:12 -0400 Subject: [PATCH 02/16] modify readme --- README.md | 1 + README_ZH.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 25efd9909..337a9e950 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ the following libraries. - File Operation - Text Processing - Multi Modality +- Wikipedia search and retrieval **Example Applications** diff --git a/README_ZH.md b/README_ZH.md index 1c13b1e55..4575c3f33 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -120,6 +120,7 @@ AgentScope支持使用以下库快速部署本地模型服务。 - 文件操作 - 文本处理 - 多模态生成 +- 维基百科搜索 **样例应用** From 4150fb97b8b971b7e696e1dc4c34b781257acb57 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 26 Jun 2024 17:47:42 -0400 Subject: [PATCH 03/16] modify --- src/agentscope/service/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py index 3561734f8..b47627daf 100644 --- a/src/agentscope/service/__init__.py +++ b/src/agentscope/service/__init__.py @@ -86,6 +86,11 @@ def get_help() -> None: "dashscope_image_to_text", "dashscope_text_to_image", "dashscope_text_to_audio", + "get_category_members", + "get_infobox", + "get_page_content_by_paragraph", + "get_all_wikipedia_tables", + "get_page_images_with_captions", # to be deprecated "ServiceFactory", ] From 77d04d523f6ca2bc8ee61804d56de3745929df4a Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 26 Jun 2024 19:10:41 -0400 Subject: [PATCH 04/16] modify --- src/agentscope/service/__init__.py | 18 +- src/agentscope/service/web/wiki.py | 630 ++++++++++++++--------------- tests/wiki_test.py | 519 ++++++++++++++---------- 3 files changed, 618 insertions(+), 549 deletions(-) diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py index b47627daf..0de13f91d 100644 --- a/src/agentscope/service/__init__.py +++ b/src/agentscope/service/__init__.py @@ -41,6 +41,14 @@ from .web.web_digest import digest_webpage, load_web, parse_html_to_text from .web.download import download_from_url +from .web.wiki import ( + wiki_get_category_members, + wiki_get_infobox, + wiki_get_page_content_by_paragraph, + wiki_get_all_wikipedia_tables, + wiki_get_page_images_with_captions, +) + def get_help() -> None: """Get help message.""" @@ -86,11 +94,11 @@ def get_help() -> None: "dashscope_image_to_text", "dashscope_text_to_image", "dashscope_text_to_audio", - "get_category_members", - "get_infobox", - "get_page_content_by_paragraph", - "get_all_wikipedia_tables", - "get_page_images_with_captions", + "wiki_get_category_members", + "wiki_get_infobox", + "wiki_get_page_content_by_paragraph", + "wiki_get_all_wikipedia_tables", + "wiki_get_page_images_with_captions", # to be deprecated "ServiceFactory", ] diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index 24b6198b9..a138c36c4 100644 --- 
a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -1,8 +1,12 @@ -"""Search contents from WikiPedia, including texts, categories, infotable, table,...""" +# -*- coding: utf-8 -*- +""" +Search contents from WikiPedia, +including texts, categories, infotable, table,... +""" + +import re import requests -import json from bs4 import BeautifulSoup -import re from agentscope.service.service_response import ( ServiceResponse, @@ -10,59 +14,102 @@ ) from agentscope.utils.common import requests_get -def get_category_members( - entity: str, - max_members: int=1000, - limit_per_request: int=500 - ) -> ServiceResponse: + +def _check_entity_existence(entity: str) -> ServiceResponse: + url = "https://en.wikipedia.org/w/api.php" + search_params = { + "action": "query", + "list": "search", + "srsearch": entity, + "format": "json", + } + + search_data = requests_get(url, params=search_params) + + if "query" in search_data and search_data["query"]["search"]: + exact_match = None + for result in search_data["query"]["search"]: + if result["title"].lower() == entity.lower(): + exact_match = result["title"] + break + if not exact_match: + similar_entities = [ + result["title"] + for result in search_data["query"]["search"][:5] + ] + return ServiceResponse( + ServiceExecStatus.ERROR, + {"similar_entities": similar_entities}, + ) + return ServiceResponse( + ServiceExecStatus.SUCCESS, + {"entity": exact_match}, + ) + else: + return ServiceResponse( + ServiceExecStatus.ERROR, + {"error": "Entity not found"}, + ) + + +def wiki_get_category_members( + entity: str, + max_members: int = 1000, + limit_per_request: int = 500, +) -> ServiceResponse: """Function to retrieve category members from Wikipedia:Category pages - + Args: - entity (str): searching keywords + entity (str): searching keywords max_members (int): maximum number of members to output limit_per_request (int): number of members retrieved per quest - + Returns: `ServiceResponse`: A dictionary containing `status` and `content`. The `status` attribute is from the ServiceExecStatus enum, indicating success or error. - If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, and return `content` as a list of dicts. + If the entity does not exist, `status`=ERROR + and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, + and return `content` as a list of dicts. Keys of each dict: - + "pageid": unique page ID for the member - - "ns": namespace for the member, indicating if the corresponding page is Article/User/... See https://en.wikipedia.org/wiki/Wikipedia:Namespace for details. - + + "ns": namespace for the member, + indicating if the corresponding page is Article/User/... + "title": title of the member - + Example: - + .. code-block:: python - members = get_category_members("Machine_learning", max_members=10) + members = wiki_get_category_members( + "Machine_learning", + max_members=10 + ) print(members) - + It returns contents: .. 
code-block:: python { 'status': , - 'content': [{'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'}, - {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'}, - {'pageid': 53587467, 'ns': 0, 'title': 'Outline of machine learning'}, - {'pageid': 64439717, 'ns': 0, 'title': '80 Million Tiny Images'}, - {'pageid': 75530149, 'ns': 0, 'title': 'Accelerated Linear Algebra'}] - + 'content': [ + {'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'}, + {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'}, + ...] + } - + """ url = "https://en.wikipedia.org/w/api.php" params = { - 'action': 'query', - 'list': 'categorymembers', - 'cmtitle': f'Category:{entity}', - 'cmlimit': limit_per_request, # Maximum number of results per request - 'format': 'json' + "action": "query", + "list": "categorymembers", + "cmtitle": f"Category:{entity}", + "cmlimit": limit_per_request, # Maximum number of results per request + "format": "json", } members = [] @@ -70,414 +117,357 @@ def get_category_members( while total_fetched < max_members: data = requests_get(url, params=params) - batch_members = data['query']['categorymembers'] + batch_members = data["query"]["categorymembers"] members.extend(batch_members) total_fetched += len(batch_members) # Check if there is a continuation token - if 'continue' in data and total_fetched < max_members: - params['cmcontinue'] = data['continue']['cmcontinue'] + if "continue" in data and total_fetched < max_members: + params["cmcontinue"] = data["continue"]["cmcontinue"] else: break - + # If more members were fetched than max_members, trim the list if len(members) > max_members: members = members[:max_members] - + if len(members) > 0: return ServiceResponse(ServiceExecStatus.SUCCESS, members) else: return ServiceResponse(ServiceExecStatus.ERROR, members) - -def get_infobox( - entity: str - ) -> ServiceResponse: + +def wiki_get_infobox( + entity: str, +) -> ServiceResponse: """ Function to retrieve InfoBox from the WikiPedia page - + Args: - entity (str): searching keywords - + entity (str): searching keywords + Returns: `ServiceResponse`: A dictionary containing `status` and `content`. The `status` attribute is from the ServiceExecStatus enum, indicating success or error. - If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, and return `content` as a dict containing information in the InfoBox. - + If the entity does not exist, `status`=ERROR, + and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, + and return `content` as a dict containing information in the InfoBox. + Example: - + .. code-block:: python - infobox_data = get_infobox(entity="Python (programming language)") + infobox_data = wiki_get_infobox(entity="Python (programming language)") print(infobox_data) - + It returns content: - + .. 
code-block:: python { 'status': , - 'content': {'Paradigm': 'Multi-paradigm : object-oriented , [1] procedural ( imperative ), functional , structured , reflective', - 'Designed\xa0by': 'Guido van Rossum', - 'Developer': 'Python Software Foundation', - 'First\xa0appeared': '20\xa0February 1991 ; 33 years ago ( 1991-02-20 ) [2]', - 'Stable release': '3.12.4 / 6 June 2024 ; 14 days ago ( 6 June 2024 )', - 'Typing discipline': 'duck , dynamic , strong ; [3] optional type annotations (since 3.5, but those hints are ignored, except with unofficial tools) [4]', - 'OS': 'Tier 1 : 64-bit Linux , macOS ; 64- and 32-bit Windows 10+ [5] Tier 2 : E.g. 32-bit WebAssembly (WASI) Tier 3 : 64-bit FreeBSD , iOS ; e.g. Raspberry Pi OS Unofficial (or has been known to work): Other Unix-like / BSD variants and e.g. Android 5.0+ (official from Python 3.13 planned [6] ) and a few other platforms [7] [8] [9]', - 'License': 'Python Software Foundation License', - 'Filename extensions': '.py, .pyw, .pyz, [10] .pyi, .pyc, .pyd', + 'content': {'Paradigm': 'Multi-paradigm : object-oriented ...', + 'Designed\xa0by': 'Guido van Rossum', + 'Developer': 'Python Software Foundation', + 'First\xa0appeared': '20\xa0February 1991 ...', + 'Stable release': '3.12.4 / 6 June 2024 ; ...', + 'Typing discipline': 'duck , dynamic , strong ; ...', + 'OS': 'Tier 1 : 64-bit Linux , macOS ; 。。。', + 'License': 'Python Software Foundation License', + 'Filename extensions': '.py, .pyw, .pyz, [10] .pyi, ...', 'Website': 'python.org'} } """ - + + existence_response = _check_entity_existence(entity) + if existence_response.status == ServiceExecStatus.ERROR: + return existence_response + url = "https://en.wikipedia.org/w/api.php" - - # Step 1: Check if the entity exists - search_params = { - 'action': 'query', - 'list': 'search', - 'srsearch': entity, - 'format': 'json' + parse_params = { + "action": "parse", + "page": entity, + "prop": "text", + "format": "json", } - - search_data = requests_get(url, params=search_params) - - if 'query' in search_data and search_data['query']['search']: - # Check if the exact title exists - exact_match = None - for result in search_data['query']['search']: - if result['title'].lower() == entity.lower(): - exact_match = result['title'] - break - if not exact_match: - similar_entities = [result['title'] for result in search_data['query']['search'][:5]] - return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. 
Here are similar entities:{similar_entities}") - entity = exact_match + parse_data = requests_get(url, params=parse_params) - # Step 2: Fetch the infobox content if the entity exists - parse_params = { - 'action': 'parse', - 'page': entity, - 'prop': 'text', - 'format': 'json' - } - - parse_data = requests_get(url, params=parse_params) - - if 'parse' in parse_data: - raw_html = parse_data['parse']['text']['*'] - soup = BeautifulSoup(raw_html, 'html.parser') - infobox = soup.find('table', {'class': 'infobox'}) - - if not infobox: - return ServiceResponse(ServiceExecStatus.ERROR, None) - - infobox_data = {} - for row in infobox.find_all('tr'): - header = row.find('th') - value = row.find('td') - if header and value: - key = header.get_text(" ", strip=True) - val = value.get_text(" ", strip=True) - infobox_data[key] = val - - return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data) - else: - error_message = parse_data.get('error', {}).get('info', 'Unknown error occurred') - return ServiceResponse(ServiceExecStatus.ERROR, {"error": error_message}) - else: - return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + if "parse" in parse_data: + raw_html = parse_data["parse"]["text"]["*"] + soup = BeautifulSoup(raw_html, "html.parser") + infobox = soup.find("table", {"class": "infobox"}) + if not infobox: + return ServiceResponse(ServiceExecStatus.ERROR, None) -def get_page_content_by_paragraph( - entity: str, - max_paragraphs: int=1 - ) -> ServiceResponse: + infobox_data = {} + for row in infobox.find_all("tr"): + header = row.find("th") + value = row.find("td") + if header and value: + key = header.get_text(" ", strip=True) + val = value.get_text(" ", strip=True) + infobox_data[key] = val + + return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data) + else: + error_message = parse_data.get("error", {}).get( + "info", + "Unknown error occurred", + ) + return ServiceResponse( + ServiceExecStatus.ERROR, + {"error": error_message}, + ) + + +def wiki_get_page_content_by_paragraph( + entity: str, + max_paragraphs: int = 1, +) -> ServiceResponse: """ Retrieve content from a Wikipedia page and split it into paragraphs, excluding section headers. Args: entity (str): search word. - max_paragraphs (int, optional): The maximum number of paragraphs to retrieve. Default is None (retrieve all paragraphs). + max_paragraphs (int, optional): + The maximum number of paragraphs to retrieve. + Default is None (retrieve all paragraphs). Returns: `ServiceResponse`: A dictionary containing `status` and `content`. The `status` attribute is from the ServiceExecStatus enum, indicating success or error. - If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, and return `content` as a list of paragraphs from the Wikipedia page. - + If the entity does not exist, `status`=ERROR, + and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, + and return `content` as a list of paragraphs from the Wikipedia page. + Example: - + .. code-block:: python - wiki_paragraph = get_page_content_by_paragraph(entity="Python (programming language)", max_paragraphs=1) + wiki_paragraph = wiki_get_page_content_by_paragraph( + entity="Python (programming language)", + max_paragraphs=1) print(wiki_paragraph) - + It will return content: .. 
code-block:: python { - 'status': , - 'content': ['Python is a high-level, general-purpose programming...'] + 'status': , + 'content': ['Python is a high-level...'] } """ + existence_response = _check_entity_existence(entity) + if existence_response.status == ServiceExecStatus.ERROR: + return existence_response + url = "https://en.wikipedia.org/w/api.php" - - # Step 1: Check if the entity exists - search_params = { - 'action': 'query', - 'list': 'search', - 'srsearch': entity, - 'format': 'json' + params = { + "action": "query", + "prop": "extracts", + "explaintext": True, + "titles": entity, + "format": "json", } - - search_data = requests_get(url, params=search_params) - - if 'query' in search_data and search_data['query']['search']: - # Check if the exact title exists - exact_match = None - for result in search_data['query']['search']: - if result['title'].lower() == entity.lower(): - exact_match = result['title'] - break - if not exact_match: - similar_entities = [result['title'] for result in search_data['query']['search'][:5]] - return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. Here are similar entities: {similar_entities}") - entity = exact_match + data = requests_get(url, params=params) + page = next(iter(data["query"]["pages"].values())) + content = page.get("extract", "No content found.") + if content == "No content found.": + return ServiceResponse(ServiceExecStatus.ERROR, content) - # Step 2: Fetch the page content if the entity exists - params = { - 'action': 'query', - 'prop': 'extracts', - 'explaintext': True, - 'titles': entity, - 'format': 'json' - } + # Split content into paragraphs and filter out headers + paragraphs = [ + para.strip() + for para in content.split("\n\n") + if not re.match(r"^\s*==.*==\s*$", para) and para.strip() != "" + ] + + # Return the specified number of paragraphs + if max_paragraphs: + paragraphs = paragraphs[:max_paragraphs] + + return ServiceResponse(ServiceExecStatus.SUCCESS, paragraphs) - data = requests_get(url, params=params) - page = next(iter(data['query']['pages'].values())) - content = page.get('extract', 'No content found.') - if content == 'No content found.': - return ServiceResponse(ServiceExecStatus.ERROR, content) - - # Split content into paragraphs and filter out headers - paragraphs = [para.strip() for para in content.split('\n\n') if not re.match(r'^\s*==.*==\s*$', para) and para.strip() != ''] - - # Return the specified number of paragraphs - if max_paragraphs: - paragraphs = paragraphs[:max_paragraphs] - - return ServiceResponse(ServiceExecStatus.SUCCESS, paragraphs) - else: - return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) -def get_all_wikipedia_tables( - entity: str - ) -> ServiceResponse: +def wiki_get_all_wikipedia_tables( + entity: str, +) -> ServiceResponse: """ Retrieve tables on the Wikipedia page - + Args: entity (str): search word. - + Returns: `ServiceResponse`: A dictionary containing `status` and `content`. The `status` attribute is from the ServiceExecStatus enum, indicating success or error. - If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, and return `content` as a list of tables from the Wikipedia page. - Each table is presented as a dict, where key is the column name and value is the values for each column. - + If the entity does not exist, `status`=ERROR, + and return top-5 similar entities in `content`. 
+ If the entity exists, `status`=SUCCESS, + and return `content` as a list of tables from the Wikipedia page. + Each table is presented as a dict, + where key is the column name and value is the values for each column. + Example: - + .. code-block:: python - wiki_table = get_all_wikipedia_tables(entity="Python (programming language)") + wiki_table = wiki_get_all_wikipedia_tables( + entity="Python (programming language)" + ) print(wiki_table) - + It will return content: .. code-block:: python { 'status': , 'content': [ { - 'Type': ['bool','bytearray','bytes','complex',...], - 'Mutability': ['immutable','mutable','immutable','immutable',...], + 'Type': ['bool','bytearray',...], + 'Mutability': ['immutable','mutable',...], ... } ] } """ + existence_response = _check_entity_existence(entity) + if existence_response.status == ServiceExecStatus.ERROR: + return existence_response + url = "https://en.wikipedia.org/w/api.php" - - # Step 1: Check if the entity exists - search_params = { - 'action': 'query', - 'list': 'search', - 'srsearch': entity, - 'format': 'json' + params = { + "action": "parse", + "page": entity, + "prop": "text", + "format": "json", } - - search_response = requests_get(url, params=search_params) - search_data = search_response - - if 'query' in search_data and search_data['query']['search']: - # Check if the exact title exists - exact_match = None - for result in search_data['query']['search']: - if result['title'].lower() == entity.lower(): - exact_match = result['title'] - break - if not exact_match: - similar_entities = [result['title'] for result in search_data['query']['search'][:5]] - return ServiceResponse(ServiceExecStatus.ERROR, f"Entity not found. Here are similar entities:{similar_entities}") - entity = exact_match + data = requests_get(url, params=params) + raw_html = data["parse"]["text"]["*"] - # Step 2: Fetch the page content if the entity exists - params = { - 'action': 'parse', - 'page': entity, - 'prop': 'text', - 'format': 'json' - } + soup = BeautifulSoup(raw_html, "html.parser") + tables = soup.find_all("table", {"class": "wikitable"}) - data = requests_get(url, params=params) - raw_html = data['parse']['text']['*'] + if not tables: + return ServiceResponse(ServiceExecStatus.ERROR, None) - soup = BeautifulSoup(raw_html, 'html.parser') - tables = soup.find_all('table', {'class': 'wikitable'}) - - if not tables: - return ServiceResponse(ServiceExecStatus.ERROR, None) + all_tables_data = [] + for table_index, table in enumerate(tables): + headers = [ + header.get_text(strip=True) for header in table.find_all("th") + ] + table_dict = {header: [] for header in headers} - all_tables_data = [] - for table_index, table in enumerate(tables): - headers = [header.get_text(strip=True) for header in table.find_all('th')] - table_dict = {header: [] for header in headers} + for row in table.find_all("tr")[1:]: # Skip the header row + cells = row.find_all(["td", "th"]) + if len(cells) == len( + headers, + ): # Ensure the row has the correct number of cells + for i, cell in enumerate(cells): + table_dict[headers[i]].append( + cell.get_text(strip=True), + ) - for row in table.find_all('tr')[1:]: # Skip the header row - cells = row.find_all(['td', 'th']) - if len(cells) == len(headers): # Ensure the row has the correct number of cells - for i, cell in enumerate(cells): - table_dict[headers[i]].append(cell.get_text(strip=True)) - - all_tables_data.append(table_dict) + all_tables_data.append(table_dict) - return ServiceResponse(ServiceExecStatus.SUCCESS, all_tables_data) - 
else: - return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) + return ServiceResponse(ServiceExecStatus.SUCCESS, all_tables_data) -def get_page_images_with_captions( - entity: str - ) -> ServiceResponse: +def wiki_get_page_images_with_captions( + entity: str, +) -> ServiceResponse: """ Function to retrive images and details on the Wikipedia page - + Args: entity (str): search word. - + Returns: `ServiceResponse`: A dictionary containing `status` and `content`. The `status` attribute is from the ServiceExecStatus enum, indicating success or error. - If the entity does not exist, `status`=ERROR and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS and return the `content` as a list of dict from the Wikipedia page. - + If the entity does not exist, `status`=ERROR, + and return top-5 similar entities in `content`. + If the entity exists, `status`=SUCCESS, + and return the `content` as a list of dict from the Wikipedia page. + Each dict has: 'title': title of the image 'url': link to the image 'caption': caption of the image - + Example: .. code-block:: python - wiki_images = get_page_images_with_captions(entity="Python (programming language)") + wiki_images = wiki_get_page_images_with_captions( + entity="Python (programming language)" + ) print(wiki_images) - + It will return: - + .. code-block:: python { 'status': , 'content': [{ - 'title': 'File:Commons-logo.svg', - 'url': 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg', - 'caption': 'The Wikimedia Commons logo, SVG version.'}, - ... + 'title': 'File:Commons-logo.svg', + 'url': 'https://upload.wikimedia.org...', + 'caption': 'The Wikimedia Commons logo,...'}, + ... ] } """ - - url = "https://en.wikipedia.org/w/api.php" - - # Step 1: Check if the entity exists - search_params = { - 'action': 'query', - 'list': 'search', - 'srsearch': entity, - 'format': 'json' - } - - search_response = requests_get(url, params=search_params) - search_data = search_response - - if 'query' in search_data and search_data['query']['search']: - # Check if the exact title exists - exact_match = None - for result in search_data['query']['search']: - if result['title'].lower() == entity.lower(): - exact_match = result['title'] - break - if not exact_match: - similar_entities = [result['title'] for result in search_data['query']['search'][:5]] - return ServiceResponse(ServiceExecStatus.ERROR, {"similar_entities": similar_entities}) - entity = exact_match + existence_response = _check_entity_existence(entity) + if existence_response.status == ServiceExecStatus.ERROR: + return existence_response + + url = "https://en.wikipedia.org/w/api.php" - # Step 2: Get the list of images + params = { + "action": "query", + "prop": "images", + "titles": entity, + "format": "json", + } + data = requests_get(url, params=params) + page = next(iter(data["query"]["pages"].values())) + images = page.get("images", []) + if len(images) == 0: + return ServiceResponse(ServiceExecStatus.ERROR, None) + + image_details = [] + for image in images: + image_title = image["title"] params = { - 'action': 'query', - 'prop': 'images', - 'titles': entity, - 'format': 'json' + "action": "query", + "titles": image_title, + "prop": "imageinfo", + "iiprop": "url|extmetadata", + "format": "json", } - data = requests_get(url, params=params) - page = next(iter(data['query']['pages'].values())) - images = page.get('images', []) - if len(images) == 0: - return ServiceResponse(ServiceExecStatus.ERROR, None) - - # Step 3: Get details 
for each image - image_details = [] - for image in images: - image_title = image['title'] - params = { - 'action': 'query', - 'titles': image_title, - 'prop': 'imageinfo', - 'iiprop': 'url|extmetadata', - 'format': 'json' - } - response = requests.get(url, params=params) - data = response.json() - image_page = next(iter(data['query']['pages'].values())) - if 'imageinfo' in image_page: - image_info = image_page['imageinfo'][0] - image_url = image_info.get('url', '') - extmetadata = image_info.get('extmetadata', {}) - caption = extmetadata.get('ImageDescription', {}).get('value', 'No caption available') - image_details.append({ - 'title': image_title, - 'url': image_url, - 'caption': caption - }) - - return ServiceResponse(ServiceExecStatus.SUCCESS, image_details) - else: - return ServiceResponse(ServiceExecStatus.ERROR, {"error": "Entity not found"}) - - + response = requests.get(url, params=params) + data = response.json() + image_page = next(iter(data["query"]["pages"].values())) + if "imageinfo" in image_page: + image_info = image_page["imageinfo"][0] + image_url = image_info.get("url", "") + extmetadata = image_info.get("extmetadata", {}) + caption = extmetadata.get("ImageDescription", {}).get( + "value", + "No caption available", + ) + image_details.append( + { + "title": image_title, + "url": image_url, + "caption": caption, + }, + ) + + return ServiceResponse(ServiceExecStatus.SUCCESS, image_details) diff --git a/tests/wiki_test.py b/tests/wiki_test.py index 9481d45c2..1279e436f 100644 --- a/tests/wiki_test.py +++ b/tests/wiki_test.py @@ -1,88 +1,106 @@ +# -*- coding: utf-8 -*- """Wiki retriever test.""" import unittest from unittest.mock import Mock, patch, MagicMock from agentscope.service import ServiceResponse -from agentscope.service import get_category_members, get_infobox, get_page_content_by_paragraph, get_all_wikipedia_tables, get_page_images_with_captions +from agentscope.service import ( + wiki_get_category_members, + wiki_get_infobox, + wiki_get_page_content_by_paragraph, + wiki_get_all_wikipedia_tables, + wiki_get_page_images_with_captions, +) from agentscope.service.service_status import ServiceExecStatus + class TestWiki(unittest.TestCase): """ExampleTest for a unit test.""" - + @patch("agentscope.utils.common.requests.get") - def test_get_category_members(self, mock_get: MagicMock) -> None: + def test_wiki_get_category_members( + self, + mock_get: MagicMock, + ) -> None: """Test test_get_category_members""" mock_response = Mock() mock_dict = { - 'query': { - 'categorymembers': [{ - 'pageid': 20, - 'ns': 0, - 'title': 'This is a test' - }] - } - } - + "query": { + "categorymembers": [ + { + "pageid": 20, + "ns": 0, + "title": "This is a test", + }, + ], + }, + } + expected_result = ServiceResponse( status=ServiceExecStatus.SUCCESS, content=[ - {'pageid': 20, - 'ns': 0, - 'title': 'This is a test' - } - ] + { + "pageid": 20, + "ns": 0, + "title": "This is a test", + }, + ], ) - + mock_response.json.return_value = mock_dict mock_get.return_value = mock_response - + test_entity = "Test" - max_members=1 - limit_per_request=100 + max_members = 1 + limit_per_request = 100 params = { - 'action': 'query', - 'list': 'categorymembers', - 'cmtitle': f'Category:{test_entity}', - 'cmlimit': limit_per_request, # Maximum number of results per request - 'format': 'json' - } - - results = get_category_members( - entity=test_entity, + "action": "query", + "list": "categorymembers", + "cmtitle": f"Category:{test_entity}", + "cmlimit": limit_per_request, + "format": "json", + } + + 
results = wiki_get_category_members( + entity=test_entity, max_members=max_members, - limit_per_request=limit_per_request - ) + limit_per_request=limit_per_request, + ) mock_get.assert_called_once_with( "https://en.wikipedia.org/w/api.php", params=params, ) - + self.assertEqual( results, expected_result, ) - + @patch("agentscope.utils.common.requests.get") - def test_get_infobox(self, mock_get: MagicMock) -> None: + def test_wiki_get_infobox( + self, + mock_get: MagicMock, + ) -> None: """Test get_infobox with different parameters and responses""" - + # Mock responses for search query mock_response_search = Mock() mock_dict_search = { - 'query': { - 'search': [ - {'title': 'Test'} - ] - } + "query": { + "search": [ + {"title": "Test"}, + ], + }, } - + # Mock responses for parse query mock_response_parse = Mock() mock_dict_parse = { - 'parse': { - 'title': 'Test', - 'pageid': 20, - 'text': { '*':""" + "parse": { + "title": "Test", + "pageid": 20, + "text": { + "*": """ @@ -93,72 +111,79 @@ def test_get_infobox(self, mock_get: MagicMock) -> None:
 <table class="infobox">
 <tr><th>Column1</th><td>Data1</td></tr>
 <tr><th>Column2</th><td>Data2</td></tr>
 </table>
- """ - } - } + """, + }, + }, } expected_result = ServiceResponse( status=ServiceExecStatus.SUCCESS, content={ - 'Column1': 'Data1', - 'Column2': 'Data2' - } + "Column1": "Data1", + "Column2": "Data2", + }, ) - # Set the side effect of the mock_get to return different responses in sequence mock_response_search.json.return_value = mock_dict_search mock_response_parse.json.return_value = mock_dict_parse mock_get.side_effect = [mock_response_search, mock_response_parse] test_entity = "Test" - results = get_infobox(entity=test_entity) + results = wiki_get_infobox(entity=test_entity) # Define expected calls calls = [ - unittest.mock.call("https://en.wikipedia.org/w/api.php", params={ - 'action': 'query', - 'list': 'search', - 'srsearch': test_entity, - 'format': 'json' - }), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params={ - 'action': 'parse', - 'page': test_entity, - 'prop': 'text', - 'format': 'json' - }) + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params={ + "action": "query", + "list": "search", + "srsearch": test_entity, + "format": "json", + }, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params={ + "action": "parse", + "page": test_entity, + "prop": "text", + "format": "json", + }, + ), ] - + mock_get.assert_has_calls(calls, any_order=True) self.assertEqual(results, expected_result) - - + @patch("agentscope.utils.common.requests.get") - def test_get_page_content_by_paragraph(self, mock_get: MagicMock) -> None: - """Test get_page_content_by_paragraph with different parameters and responses""" - + def test_wiki_get_page_content_by_paragraph( + self, + mock_get: MagicMock, + ) -> None: + """Test get_page_content_by_paragraph""" + # Mock responses for search query mock_response_search = Mock() mock_dict_search = { - 'query': { - 'search': [ - {'title': 'Test'} - ] - } + "query": { + "search": [ + {"title": "Test"}, + ], + }, } - + # Mock responses for extract query mock_response_extract = Mock() mock_dict_extract = { - 'query': { - 'pages': { - '20': { - 'pageid': 20, - 'title': 'Test', - 'extract': """ + "query": { + "pages": { + "20": { + "pageid": 20, + "title": "Test", + "extract": """ This is the first paragraph. This is the second paragraph. @@ -166,74 +191,86 @@ def test_get_page_content_by_paragraph(self, mock_get: MagicMock) -> None: == Section Header == This is the third paragraph under a section header. - """ - } - } - } + """, + }, + }, + }, } expected_result = ServiceResponse( status=ServiceExecStatus.SUCCESS, content=[ "This is the first paragraph.", - "This is the second paragraph." 
- ] + "This is the second paragraph.", + ], ) - # Set the side effect of the mock_get to return different responses in sequence mock_response_search.json.return_value = mock_dict_search mock_response_extract.json.return_value = mock_dict_extract mock_get.side_effect = [mock_response_search, mock_response_extract] test_entity = "Test" - results = get_page_content_by_paragraph(entity=test_entity, max_paragraphs=2) + results = wiki_get_page_content_by_paragraph( + entity=test_entity, + max_paragraphs=2, + ) # Define expected calls params1 = { - 'action': 'query', - 'list': 'search', - 'srsearch': test_entity, - 'format': 'json' + "action": "query", + "list": "search", + "srsearch": test_entity, + "format": "json", } params2 = { - 'action': 'query', - 'prop': 'extracts', - 'explaintext': True, - 'titles': test_entity, - 'format': 'json' + "action": "query", + "prop": "extracts", + "explaintext": True, + "titles": test_entity, + "format": "json", } calls = [ - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2) + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params1, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params2, + ), ] - + mock_get.assert_has_calls(calls, any_order=True) self.assertEqual(results, expected_result) - + @patch("agentscope.utils.common.requests.get") - def test_get_all_wikipedia_tables(self, mock_get: MagicMock) -> None: - """Test get_all_wikipedia_tables with different parameters and responses""" - + def test_wiki_get_all_wikipedia_tables( + self, + mock_get: MagicMock, + ) -> None: + """Test get_all_wikipedia_tables""" + # Mock responses for search query mock_response_search = Mock() mock_dict_search = { - 'query': { - 'search': [ - {'title': 'Test'} - ] - } + "query": { + "search": [ + {"title": "Test"}, + ], + }, } - + # Mock responses for parse query mock_response_parse = Mock() mock_dict_parse = { - 'parse': { - 'title': 'Test', - 'pageid': 20, - 'text': { '*':""" + "parse": { + "title": "Test", + "pageid": 20, + "text": { + "*": """ @@ -248,183 +285,217 @@ def test_get_all_wikipedia_tables(self, mock_get: MagicMock) -> None:
                    <table class="wikitable">
                        <tr><th>Header1</th><th>Header2</th></tr>
                        <tr><td>Row1Col1</td><td>Row1Col2</td></tr>
                        <tr><td>Row2Col1</td><td>Row2Col2</td></tr>
                    </table>
- """ - } - } + """, + }, + }, } expected_result = ServiceResponse( status=ServiceExecStatus.SUCCESS, - content=[{ - 'Header1': ['Row1Col1', 'Row2Col1'], - 'Header2': ['Row1Col2', 'Row2Col2'] - }] + content=[ + { + "Header1": ["Row1Col1", "Row2Col1"], + "Header2": ["Row1Col2", "Row2Col2"], + }, + ], ) - # Set the side effect of the mock_get to return different responses in sequence mock_response_search.json.return_value = mock_dict_search mock_response_parse.json.return_value = mock_dict_parse mock_get.side_effect = [mock_response_search, mock_response_parse] test_entity = "Test" - results = get_all_wikipedia_tables(entity=test_entity) + results = wiki_get_all_wikipedia_tables(entity=test_entity) # Define expected calls params1 = { - 'action': 'query', - 'list': 'search', - 'srsearch': test_entity, - 'format': 'json' + "action": "query", + "list": "search", + "srsearch": test_entity, + "format": "json", } params2 = { - 'action': 'parse', - 'page': test_entity, - 'prop': 'text', - 'format': 'json' + "action": "parse", + "page": test_entity, + "prop": "text", + "format": "json", } calls = [ - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2) + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params1, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params2, + ), ] - + mock_get.assert_has_calls(calls, any_order=True) self.assertEqual(results, expected_result) - - + @patch("agentscope.utils.common.requests.get") - def test_get_page_images_with_captions(self, mock_get: MagicMock) -> None: - """Test get_page_images_with_captions with different parameters and responses""" - + def test_get_page_images_with_captions( + self, + mock_get: MagicMock, + ) -> None: + """Test get_page_images_with_captions""" + # Mock responses for search query mock_response_search = Mock() mock_dict_search = { - 'query': { - 'search': [ - {'title': 'Test'} - ] - } + "query": { + "search": [ + {"title": "Test"}, + ], + }, } - + # Mock responses for images query mock_response_images = Mock() mock_dict_images = { - 'query': { - 'pages': { - '20': { - 'pageid': 20, - 'title': 'Test', - 'images': [ - {'title': 'Image1'}, - {'title': 'Image2'} - ] - } - } - } + "query": { + "pages": { + "20": { + "pageid": 20, + "title": "Test", + "images": [ + {"title": "Image1"}, + {"title": "Image2"}, + ], + }, + }, + }, } - + # Mock responses for image details query mock_response_image1 = Mock() mock_dict_image1 = { - 'query': { - 'pages': { - '30': { - 'pageid': 30, - 'imageinfo': [{ - 'url': 'http://example.com/image1.jpg', - 'extmetadata': { - 'ImageDescription': {'value': 'Caption for image 1'} - } - }] - } - } - } + "query": { + "pages": { + "30": { + "pageid": 30, + "imageinfo": [ + { + "url": "http://example.com/image1.jpg", + "extmetadata": { + "ImageDescription": { + "value": "Caption for image 1", + }, + }, + }, + ], + }, + }, + }, } - + mock_response_image2 = Mock() mock_dict_image2 = { - 'query': { - 'pages': { - '31': { - 'pageid': 31, - 'imageinfo': [{ - 'url': 'http://example.com/image2.jpg', - 'extmetadata': { - 'ImageDescription': {'value': 'Caption for image 2'} - } - }] - } - } - } + "query": { + "pages": { + "31": { + "pageid": 31, + "imageinfo": [ + { + "url": "http://example.com/image2.jpg", + "extmetadata": { + "ImageDescription": { + "value": "Caption for image 2", + }, + }, + }, + ], + }, + }, + }, } expected_result = ServiceResponse( status=ServiceExecStatus.SUCCESS, 
content=[ { - 'title': 'Image1', - 'url': 'http://example.com/image1.jpg', - 'caption': 'Caption for image 1' + "title": "Image1", + "url": "http://example.com/image1.jpg", + "caption": "Caption for image 1", }, { - 'title': 'Image2', - 'url': 'http://example.com/image2.jpg', - 'caption': 'Caption for image 2' - } - ] + "title": "Image2", + "url": "http://example.com/image2.jpg", + "caption": "Caption for image 2", + }, + ], ) - # Set the side effect of the mock_get to return different responses in sequence mock_response_search.json.return_value = mock_dict_search mock_response_images.json.return_value = mock_dict_images mock_response_image1.json.return_value = mock_dict_image1 mock_response_image2.json.return_value = mock_dict_image2 - mock_get.side_effect = [mock_response_search, mock_response_images, mock_response_image1, mock_response_image2] + mock_get.side_effect = [ + mock_response_search, + mock_response_images, + mock_response_image1, + mock_response_image2, + ] test_entity = "Test" - results = get_page_images_with_captions(entity=test_entity) + results = wiki_get_page_images_with_captions(entity=test_entity) # Define expected calls params1 = { - 'action': 'query', - 'list': 'search', - 'srsearch': test_entity, - 'format': 'json' + "action": "query", + "list": "search", + "srsearch": test_entity, + "format": "json", } params2 = { - 'action': 'query', - 'prop': 'images', - 'titles': test_entity, - 'format': 'json' + "action": "query", + "prop": "images", + "titles": test_entity, + "format": "json", } params3_image1 = { - 'action': 'query', - 'titles': 'Image1', - 'prop': 'imageinfo', - 'iiprop': 'url|extmetadata', - 'format': 'json' + "action": "query", + "titles": "Image1", + "prop": "imageinfo", + "iiprop": "url|extmetadata", + "format": "json", } params4_image2 = { - 'action': 'query', - 'titles': 'Image2', - 'prop': 'imageinfo', - 'iiprop': 'url|extmetadata', - 'format': 'json' + "action": "query", + "titles": "Image2", + "prop": "imageinfo", + "iiprop": "url|extmetadata", + "format": "json", } calls = [ - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params1), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params2), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params3_image1), - unittest.mock.call("https://en.wikipedia.org/w/api.php", params=params4_image2) + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params1, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params2, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params3_image1, + ), + unittest.mock.call( + "https://en.wikipedia.org/w/api.php", + params=params4_image2, + ), ] - + mock_get.assert_has_calls(calls, any_order=True) self.assertEqual(results, expected_result) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From cb8706765447402bd7aa2769af14901c60a54635 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Fri, 28 Jun 2024 13:50:21 -0400 Subject: [PATCH 05/16] fix format --- src/agentscope/service/web/wiki.py | 41 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index a138c36c4..c13f097b6 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -5,7 +5,8 @@ """ import re -import requests + +# import requests from bs4 import BeautifulSoup from agentscope.service.service_response import ( @@ 
-45,11 +46,10 @@ def _check_entity_existence(entity: str) -> ServiceResponse: ServiceExecStatus.SUCCESS, {"entity": exact_match}, ) - else: - return ServiceResponse( - ServiceExecStatus.ERROR, - {"error": "Entity not found"}, - ) + return ServiceResponse( + ServiceExecStatus.ERROR, + {"error": "Entity not found"}, + ) def wiki_get_category_members( @@ -96,7 +96,8 @@ def wiki_get_category_members( { 'status': , 'content': [ - {'pageid': 67911196, 'ns': 0, 'title': 'Bayesian learning mechanisms'}, + {'pageid': 67911196, 'ns': 0, + 'title': 'Bayesian learning mechanisms'}, {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'}, ...] @@ -133,8 +134,8 @@ def wiki_get_category_members( if len(members) > 0: return ServiceResponse(ServiceExecStatus.SUCCESS, members) - else: - return ServiceResponse(ServiceExecStatus.ERROR, members) + + return ServiceResponse(ServiceExecStatus.ERROR, members) def wiki_get_infobox( @@ -211,15 +212,14 @@ def wiki_get_infobox( infobox_data[key] = val return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data) - else: - error_message = parse_data.get("error", {}).get( - "info", - "Unknown error occurred", - ) - return ServiceResponse( - ServiceExecStatus.ERROR, - {"error": error_message}, - ) + error_message = parse_data.get("error", {}).get( + "info", + "Unknown error occurred", + ) + return ServiceResponse( + ServiceExecStatus.ERROR, + {"error": error_message}, + ) def wiki_get_page_content_by_paragraph( @@ -358,7 +358,7 @@ def wiki_get_all_wikipedia_tables( return ServiceResponse(ServiceExecStatus.ERROR, None) all_tables_data = [] - for table_index, table in enumerate(tables): + for _, table in enumerate(tables): headers = [ header.get_text(strip=True) for header in table.find_all("th") ] @@ -451,8 +451,7 @@ def wiki_get_page_images_with_captions( "iiprop": "url|extmetadata", "format": "json", } - response = requests.get(url, params=params) - data = response.json() + data = requests_get(url, params=params) image_page = next(iter(data["query"]["pages"].values())) if "imageinfo" in image_page: image_info = image_page["imageinfo"][0] From ebb2af839d3cae600034832d1a5cce3818ffee4e Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 3 Jul 2024 13:44:30 -0400 Subject: [PATCH 06/16] modify url --- src/agentscope/service/web/wiki.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index c13f097b6..17df81c37 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -16,8 +16,12 @@ from agentscope.utils.common import requests_get -def _check_entity_existence(entity: str) -> ServiceResponse: +def wiki_api(params): url = "https://en.wikipedia.org/w/api.php" + return requests_get(url, params=params) + + +def _check_entity_existence(entity: str) -> ServiceResponse: search_params = { "action": "query", "list": "search", @@ -25,7 +29,7 @@ def _check_entity_existence(entity: str) -> ServiceResponse: "format": "json", } - search_data = requests_get(url, params=search_params) + search_data = wiki_api(search_params) if "query" in search_data and search_data["query"]["search"]: exact_match = None @@ -104,7 +108,6 @@ def wiki_get_category_members( } """ - url = "https://en.wikipedia.org/w/api.php" params = { "action": "query", "list": "categorymembers", @@ -117,7 +120,7 @@ def wiki_get_category_members( total_fetched = 0 while total_fetched < max_members: - data = requests_get(url, params=params) + data = wiki_api(params) 
batch_members = data["query"]["categorymembers"] members.extend(batch_members) total_fetched += len(batch_members) @@ -184,7 +187,6 @@ def wiki_get_infobox( if existence_response.status == ServiceExecStatus.ERROR: return existence_response - url = "https://en.wikipedia.org/w/api.php" parse_params = { "action": "parse", "page": entity, @@ -192,7 +194,7 @@ def wiki_get_infobox( "format": "json", } - parse_data = requests_get(url, params=parse_params) + parse_data = wiki_api(parse_params) if "parse" in parse_data: raw_html = parse_data["parse"]["text"]["*"] @@ -265,7 +267,6 @@ def wiki_get_page_content_by_paragraph( if existence_response.status == ServiceExecStatus.ERROR: return existence_response - url = "https://en.wikipedia.org/w/api.php" params = { "action": "query", "prop": "extracts", @@ -274,7 +275,7 @@ def wiki_get_page_content_by_paragraph( "format": "json", } - data = requests_get(url, params=params) + data = wiki_api(params) page = next(iter(data["query"]["pages"].values())) content = page.get("extract", "No content found.") if content == "No content found.": @@ -340,7 +341,6 @@ def wiki_get_all_wikipedia_tables( if existence_response.status == ServiceExecStatus.ERROR: return existence_response - url = "https://en.wikipedia.org/w/api.php" params = { "action": "parse", "page": entity, @@ -348,7 +348,7 @@ def wiki_get_all_wikipedia_tables( "format": "json", } - data = requests_get(url, params=params) + data = wiki_api(params) raw_html = data["parse"]["text"]["*"] soup = BeautifulSoup(raw_html, "html.parser") @@ -427,15 +427,13 @@ def wiki_get_page_images_with_captions( if existence_response.status == ServiceExecStatus.ERROR: return existence_response - url = "https://en.wikipedia.org/w/api.php" - params = { "action": "query", "prop": "images", "titles": entity, "format": "json", } - data = requests_get(url, params=params) + data = wiki_api(params) page = next(iter(data["query"]["pages"].values())) images = page.get("images", []) if len(images) == 0: @@ -451,7 +449,7 @@ def wiki_get_page_images_with_captions( "iiprop": "url|extmetadata", "format": "json", } - data = requests_get(url, params=params) + data = wiki_api(params) image_page = next(iter(data["query"]["pages"].values())) if "imageinfo" in image_page: image_info = image_page["imageinfo"][0] From 44ec6c3f7241a48095ee42f34b1ae78fb97179a7 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 3 Jul 2024 14:09:49 -0400 Subject: [PATCH 07/16] add bs4 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 8055476f2..c87755f06 100644 --- a/setup.py +++ b/setup.py @@ -83,6 +83,7 @@ "litellm", "psutil", "scipy", + "bs4", ] distribute_requires = minimal_requires + rpc_requires From 62b31b58974ccc8169cab28083ed58e8c8338f42 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 3 Jul 2024 14:11:24 -0400 Subject: [PATCH 08/16] modify minimal requirements --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index c87755f06..b319de417 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,6 @@ "docker", "pymongo", "pymysql", - "bs4", - "beautifulsoup4", "feedparser", ] @@ -84,6 +82,7 @@ "psutil", "scipy", "bs4", + "beautifulsoup4", ] distribute_requires = minimal_requires + rpc_requires From 3bc4039761559559b4cff5fd1b2dc9c4cb6f0309 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 3 Jul 2024 17:30:16 -0400 Subject: [PATCH 09/16] correct formats --- src/agentscope/service/web/wiki.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index 17df81c37..05e3ef3e0 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -16,7 +16,8 @@ from agentscope.utils.common import requests_get -def wiki_api(params): +def wiki_api(params: dict) -> dict: + """Scratch information via Wiki API""" url = "https://en.wikipedia.org/w/api.php" return requests_get(url, params=params) From 925d856ea3f79047c4e28667e308fca9edb5df42 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 3 Jul 2024 19:22:52 -0400 Subject: [PATCH 10/16] modify _check_entity_existence --- src/agentscope/service/web/wiki.py | 35 +++++++++++++++--------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index 05e3ef3e0..4d75f3103 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -32,25 +32,26 @@ def _check_entity_existence(entity: str) -> ServiceResponse: search_data = wiki_api(search_params) - if "query" in search_data and search_data["query"]["search"]: - exact_match = None - for result in search_data["query"]["search"]: - if result["title"].lower() == entity.lower(): - exact_match = result["title"] - break - if not exact_match: - similar_entities = [ - result["title"] - for result in search_data["query"]["search"][:5] - ] + if "query" in search_data and "search" in search_data["query"]: + if search_data["query"]["search"]: + exact_match = None + for result in search_data["query"]["search"]: + if result["title"].lower() == entity.lower(): + exact_match = result["title"] + break + if not exact_match: + similar_entities = [ + result["title"] + for result in search_data["query"]["search"][:5] + ] + return ServiceResponse( + ServiceExecStatus.ERROR, + {"similar_entities": similar_entities}, + ) return ServiceResponse( - ServiceExecStatus.ERROR, - {"similar_entities": similar_entities}, + ServiceExecStatus.SUCCESS, + {"entity": exact_match}, ) - return ServiceResponse( - ServiceExecStatus.SUCCESS, - {"entity": exact_match}, - ) return ServiceResponse( ServiceExecStatus.ERROR, {"error": "Entity not found"}, From e9d540c220d2c43ab424db615ccde4f1b5bd74d8 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Fri, 26 Jul 2024 02:28:18 -0400 Subject: [PATCH 11/16] fix comments --- src/agentscope/service/web/wiki.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index 4d75f3103..eba9e33c9 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -6,7 +6,7 @@ import re -# import requests + from bs4 import BeautifulSoup from agentscope.service.service_response import ( @@ -16,13 +16,24 @@ from agentscope.utils.common import requests_get -def wiki_api(params: dict) -> dict: +def _wiki_api(params: dict) -> dict: """Scratch information via Wiki API""" url = "https://en.wikipedia.org/w/api.php" return requests_get(url, params=params) def _check_entity_existence(entity: str) -> ServiceResponse: + """ + Function to check if the eneity exists in Wikipedia + If yes, continue searching; + if not, return top 5 similar entities + + Args: + entity (str): searching keywords + + Returns: + + """ search_params = { "action": "query", "list": "search", @@ -30,7 +41,7 @@ def _check_entity_existence(entity: str) -> ServiceResponse: "format": "json", } - search_data = wiki_api(search_params) + search_data = 
_wiki_api(search_params) if "query" in search_data and "search" in search_data["query"]: if search_data["query"]["search"]: @@ -68,7 +79,7 @@ def wiki_get_category_members( Args: entity (str): searching keywords max_members (int): maximum number of members to output - limit_per_request (int): number of members retrieved per quest + limit_per_request (int): number of members retrieved per request Returns: `ServiceResponse`: A dictionary containing `status` and `content`. @@ -122,7 +133,7 @@ def wiki_get_category_members( total_fetched = 0 while total_fetched < max_members: - data = wiki_api(params) + data = _wiki_api(params) batch_members = data["query"]["categorymembers"] members.extend(batch_members) total_fetched += len(batch_members) @@ -277,7 +288,7 @@ def wiki_get_page_content_by_paragraph( "format": "json", } - data = wiki_api(params) + data = _wiki_api(params) page = next(iter(data["query"]["pages"].values())) content = page.get("extract", "No content found.") if content == "No content found.": @@ -350,7 +361,7 @@ def wiki_get_all_wikipedia_tables( "format": "json", } - data = wiki_api(params) + data = _wiki_api(params) raw_html = data["parse"]["text"]["*"] soup = BeautifulSoup(raw_html, "html.parser") @@ -435,7 +446,7 @@ def wiki_get_page_images_with_captions( "titles": entity, "format": "json", } - data = wiki_api(params) + data = _wiki_api(params) page = next(iter(data["query"]["pages"].values())) images = page.get("images", []) if len(images) == 0: @@ -451,7 +462,7 @@ def wiki_get_page_images_with_captions( "iiprop": "url|extmetadata", "format": "json", } - data = wiki_api(params) + data = _wiki_api(params) image_page = next(iter(data["query"]["pages"].values())) if "imageinfo" in image_page: image_info = image_page["imageinfo"][0] From 732db08fe4dca648d090e37d65fe5b8e1d44b92d Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Mon, 29 Jul 2024 01:42:53 -0400 Subject: [PATCH 12/16] fix comments --- src/agentscope/service/__init__.py | 1 + src/agentscope/service/web/wiki.py | 162 ++++++++++++++++++++++++++++- tests/wiki_test.py | 2 + 3 files changed, 160 insertions(+), 5 deletions(-) diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py index 0de13f91d..f4af8e325 100644 --- a/src/agentscope/service/__init__.py +++ b/src/agentscope/service/__init__.py @@ -47,6 +47,7 @@ wiki_get_page_content_by_paragraph, wiki_get_all_wikipedia_tables, wiki_get_page_images_with_captions, + wiki_page_retrieval ) diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py index eba9e33c9..9dc9f10c8 100644 --- a/src/agentscope/service/web/wiki.py +++ b/src/agentscope/service/web/wiki.py @@ -32,7 +32,49 @@ def _check_entity_existence(entity: str) -> ServiceResponse: entity (str): searching keywords Returns: - + `ServiceResponse`: A dictionary containing `status` and `content`. + The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR + and return top-5 similar entities in `content`. + If entity exists, `status`=SUCCESS, return the original entity in `content`. + + Example 1 (entity exists): + .. code-block:: python + + _check_entity_existence('Hello') + + It returns: + .. code-block:: python + + { + 'status': , + 'content': { + 'entity': 'Hello' + } + } + + Example 2 (entity does not exist): + .. code-block:: python + + _check_entity_existence('nihao') + + It returns: + .. 
code-block:: python + + { + 'status': , + 'content': { + 'similar_entities': [ + 'Ni Hao', + 'Ranma ½', + 'Ni Hao, Kai-Lan', + 'List of Ranma ½ episodes', + 'Studio Deen' + ] + } + } + """ search_params = { "action": "query", @@ -101,6 +143,7 @@ def wiki_get_category_members( Example: .. code-block:: python + members = wiki_get_category_members( "Machine_learning", max_members=10 @@ -110,6 +153,7 @@ def wiki_get_category_members( It returns contents: .. code-block:: python + { 'status': , 'content': [ @@ -175,12 +219,14 @@ def wiki_get_infobox( Example: .. code-block:: python + infobox_data = wiki_get_infobox(entity="Python (programming language)") print(infobox_data) It returns content: .. code-block:: python + { 'status': , 'content': {'Paradigm': 'Multi-paradigm : object-oriented ...', @@ -207,7 +253,7 @@ def wiki_get_infobox( "format": "json", } - parse_data = wiki_api(parse_params) + parse_data = _wiki_api(parse_params) if "parse" in parse_data: raw_html = parse_data["parse"]["text"]["*"] @@ -249,7 +295,7 @@ def wiki_get_page_content_by_paragraph( entity (str): search word. max_paragraphs (int, optional): The maximum number of paragraphs to retrieve. - Default is None (retrieve all paragraphs). + Default is 1 (retrieve the first paragraph). Returns: `ServiceResponse`: A dictionary containing `status` and `content`. @@ -263,6 +309,7 @@ def wiki_get_page_content_by_paragraph( Example: .. code-block:: python + wiki_paragraph = wiki_get_page_content_by_paragraph( entity="Python (programming language)", max_paragraphs=1) @@ -270,6 +317,7 @@ def wiki_get_page_content_by_paragraph( It will return content: .. code-block:: python + { 'status': , 'content': ['Python is a high-level...'] @@ -294,11 +342,11 @@ def wiki_get_page_content_by_paragraph( if content == "No content found.": return ServiceResponse(ServiceExecStatus.ERROR, content) - # Split content into paragraphs and filter out headers + # Split content into paragraphs, including headers paragraphs = [ para.strip() for para in content.split("\n\n") - if not re.match(r"^\s*==.*==\s*$", para) and para.strip() != "" + if para.strip() != "" ] # Return the specified number of paragraphs @@ -331,6 +379,7 @@ def wiki_get_all_wikipedia_tables( Example: .. code-block:: python + wiki_table = wiki_get_all_wikipedia_tables( entity="Python (programming language)" ) @@ -338,6 +387,7 @@ def wiki_get_all_wikipedia_tables( It will return content: .. code-block:: python + { 'status': , 'content': [ @@ -417,6 +467,7 @@ def wiki_get_page_images_with_captions( Example: .. code-block:: python + wiki_images = wiki_get_page_images_with_captions( entity="Python (programming language)" ) @@ -425,6 +476,7 @@ def wiki_get_page_images_with_captions( It will return: .. code-block:: python + { 'status': , 'content': [{ @@ -481,3 +533,103 @@ def wiki_get_page_images_with_captions( ) return ServiceResponse(ServiceExecStatus.SUCCESS, image_details) + + +def wiki_page_retrieval( + entity: str, + max_paragraphs: int = 1, +)-> ServiceResponse: + """ + Function to retrive different format + (infobox, paragraphs, tables, images) + of information on the Wikipedia page + + Args: + entity (str): search word. + max_paragraphs (int, optional): + The maximum number of paragraphs to retrieve. + Default is 1 (retrieve the first paragraph). + + Returns: + A dictionary contains retrieved information of different format. + Keys are four formats: `infobox`, `paragraph`, `table`, `image`. + The value for each key is a `ServiceResponse` object containing + `status` and `content`. 
+ The `status` attribute is from the ServiceExecStatus enum, + indicating success or error. + If the entity does not exist, `status`=ERROR, + otherwise `status`=SUCCESS. + The `content` attribute is the retrieved contents + if `status`=SUCCESS. Contents are different for each format. + `infobox`: Information in the InfoBox. + `paragraph`: A list of paragraphs from the Wikipedia page. The number + of paragraphs is determined by arg `max_paragraphs`. + `table`: A list of tables from the Wikipedia page. Each table + is presented as a dict, where key is the + column name and value is the values for each column. + `image`: A list of dict from the Wikipedia page. + Each dict has: + 'title': title of the image + 'url': link to the image + 'caption': caption of the image + + Example: + .. code-block:: python + + wiki_page_retrieval(entity='Hello', max_paragraphs=1) + + It will return: + + .. code-block:: python + + { + 'infobox': { + 'status': , + 'content': None + }, + 'paragraph': { + 'status': , + 'content': ['Hello is a salutation or greeting in the English language. It is first attested in writing from 1826.'] + }, + 'table': { + 'status': , + 'content': None + }, + 'image': { + 'status': , + 'content': [ + { + 'title': 'File:Semi-protection-shackle.svg', + 'url': 'https://upload.wikimedia.org/wikipedia/en/1/1b/Semi-protection-shackle.svg', + 'caption': '
English: Semi-protection lock with grey shackle\n
' + }, + { + 'title': 'File:TelephoneHelloNellie.jpg', + 'url': 'https://upload.wikimedia.org/wikipedia/commons/b/b3/TelephoneHelloNellie.jpg', + 'caption': 'No caption available' + }, + { + 'title': 'File:Wiktionary-logo-en-v2.svg', + 'url': 'https://upload.wikimedia.org/wikipedia/commons/9/99/Wiktionary-logo-en-v2.svg', + 'caption': 'A logo derived from ...' + } + ] + } + } + + """ + + infobox_retrieval = wiki_get_infobox(entity=entity) + paragraph_retrieval = wiki_get_page_content_by_paragraph( + entity=entity, max_paragraphs=max_paragraphs) + table_retrieval = wiki_get_all_wikipedia_tables(entity=entity) + image_retrieval = wiki_get_page_images_with_captions(entity=entity) + + total_retrieval = { + 'infobox': infobox_retrieval, + 'paragraph': paragraph_retrieval, + 'table': table_retrieval, + 'image': image_retrieval + } + + return total_retrieval diff --git a/tests/wiki_test.py b/tests/wiki_test.py index 1279e436f..25390176f 100644 --- a/tests/wiki_test.py +++ b/tests/wiki_test.py @@ -10,6 +10,7 @@ wiki_get_page_content_by_paragraph, wiki_get_all_wikipedia_tables, wiki_get_page_images_with_captions, + wiki_page_retrieval ) from agentscope.service.service_status import ServiceExecStatus @@ -495,6 +496,7 @@ def test_get_page_images_with_captions( mock_get.assert_has_calls(calls, any_order=True) self.assertEqual(results, expected_result) + if __name__ == "__main__": From 8985cd4a2a63d0149b7706fe277bc10e201ec8b3 Mon Sep 17 00:00:00 2001 From: PengfeiHePower Date: Wed, 7 Aug 2024 16:12:31 -0400 Subject: [PATCH 13/16] modify libraries --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index b319de417..4d6c3f169 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,8 @@ "pymongo", "pymysql", "feedparser", + "bs4", + "beautifulsoup4", ] doc_requires = [ @@ -81,8 +83,6 @@ "litellm", "psutil", "scipy", - "bs4", - "beautifulsoup4", ] distribute_requires = minimal_requires + rpc_requires From 42ad7cc2fb5e71c1d53505114bb0fa292b2c61cb Mon Sep 17 00:00:00 2001 From: DavdGao Date: Tue, 13 Aug 2024 16:06:42 +0800 Subject: [PATCH 14/16] Rewrite wikipedia search; Modify unittests --- .../en/source/tutorial/204-service.md | 66 +- .../zh_CN/source/tutorial/204-service.md | 4 +- src/agentscope/service/__init__.py | 17 +- src/agentscope/service/web/wiki.py | 635 ------------------ src/agentscope/service/web/wikipedia.py | 161 +++++ tests/wiki_test.py | 443 +----------- 6 files changed, 229 insertions(+), 1097 deletions(-) delete mode 100644 src/agentscope/service/web/wiki.py create mode 100644 src/agentscope/service/web/wikipedia.py diff --git a/docs/sphinx_doc/en/source/tutorial/204-service.md b/docs/sphinx_doc/en/source/tutorial/204-service.md index dad6fa3d9..5c9456dee 100644 --- a/docs/sphinx_doc/en/source/tutorial/204-service.md +++ b/docs/sphinx_doc/en/source/tutorial/204-service.md @@ -12,38 +12,40 @@ AgentScope and how to use them to enhance the capabilities of your agents. The following table outlines the various Service functions by type. These functions can be called using `agentscope.service.{function_name}`. -| Service Scene | Service Function Name | Description | -|-----------------------------|----------------------------|----------------------------------------------------------------------------------------------------------------| -| Code | `execute_python_code` | Execute a piece of Python code, optionally inside a Docker container. | -| Retrieval | `retrieve_from_list` | Retrieve a specific item from a list based on given criteria. 
| -| | `cos_sim` | Compute the cosine similarity between two different embeddings. | -| SQL Query | `query_mysql` | Execute SQL queries on a MySQL database and return results. | -| | `query_sqlite` | Execute SQL queries on a SQLite database and return results. | -| | `query_mongodb` | Perform queries or operations on a MongoDB collection. | -| Text Processing | `summarization` | Summarize a piece of text using a large language model to highlight its main points. | -| Web | `bing_search` | Perform bing search | -| | `google_search` | Perform google search | -| | `arxiv_search` | Perform arXiv search | -| | `download_from_url` | Download file from given URL. | -| | `load_web` | Load and parse the web page of the specified url (currently only supports HTML). | -| | `digest_webpage` | Digest the content of a already loaded web page (currently only supports HTML). -| | `dblp_search_publications` | Search publications in the DBLP database -| | `dblp_search_authors` | Search for author information in the DBLP database | -| | `dblp_search_venues` | Search for venue information in the DBLP database | -| File | `create_file` | Create a new file at a specified path, optionally with initial content. | -| | `delete_file` | Delete a file specified by a file path. | -| | `move_file` | Move or rename a file from one path to another. | -| | `create_directory` | Create a new directory at a specified path. | -| | `delete_directory` | Delete a directory and all its contents. | -| | `move_directory` | Move or rename a directory from one path to another. | -| | `read_text_file` | Read and return the content of a text file. | -| | `write_text_file` | Write text content to a file at a specified path. | -| | `read_json_file` | Read and parse the content of a JSON file. | -| | `write_json_file` | Serialize a Python object to JSON and write to a file. | -| Multi Modality | `dashscope_text_to_image` | Convert text to image using Dashscope API. | -| | `dashscope_image_to_text` | Convert image to text using Dashscope API. | -| | `dashscope_text_to_audio` | Convert text to audio using Dashscope API. | -| *More services coming soon* | | More service functions are in development and will be added to AgentScope to further enhance its capabilities. | +| Service Scene | Service Function Name | Description | +|-----------------------------|--------------------------------|----------------------------------------------------------------------------------------------------------------| +| Code | `execute_python_code` | Execute a piece of Python code, optionally inside a Docker container. | +| Retrieval | `retrieve_from_list` | Retrieve a specific item from a list based on given criteria. | +| | `cos_sim` | Compute the cosine similarity between two different embeddings. | +| SQL Query | `query_mysql` | Execute SQL queries on a MySQL database and return results. | +| | `query_sqlite` | Execute SQL queries on a SQLite database and return results. | +| | `query_mongodb` | Perform queries or operations on a MongoDB collection. | +| Text Processing | `summarization` | Summarize a piece of text using a large language model to highlight its main points. | +| Web | `bing_search` | Perform bing search | +| | `google_search` | Perform google search | +| | `arxiv_search` | Perform arXiv search | +| | `download_from_url` | Download file from given URL. | +| | `load_web` | Load and parse the web page of the specified url (currently only supports HTML). 
| +| | `digest_webpage` | Digest the content of a already loaded web page (currently only supports HTML). +| | `dblp_search_publications` | Search publications in the DBLP database +| | `dblp_search_authors` | Search for author information in the DBLP database | +| | `dblp_search_venues` | Search for venue information in the DBLP database | +| | `wikipedia_search` | Search for the given query in Wikipedia API | +| | `wikipedia_search_categories` | Search categories for the given query in Wikipedia:Category pages. | +| File | `create_file` | Create a new file at a specified path, optionally with initial content. | +| | `delete_file` | Delete a file specified by a file path. | +| | `move_file` | Move or rename a file from one path to another. | +| | `create_directory` | Create a new directory at a specified path. | +| | `delete_directory` | Delete a directory and all its contents. | +| | `move_directory` | Move or rename a directory from one path to another. | +| | `read_text_file` | Read and return the content of a text file. | +| | `write_text_file` | Write text content to a file at a specified path. | +| | `read_json_file` | Read and parse the content of a JSON file. | +| | `write_json_file` | Serialize a Python object to JSON and write to a file. | +| Multi Modality | `dashscope_text_to_image` | Convert text to image using Dashscope API. | +| | `dashscope_image_to_text` | Convert image to text using Dashscope API. | +| | `dashscope_text_to_audio` | Convert text to audio using Dashscope API. | +| *More services coming soon* | | More service functions are in development and will be added to AgentScope to further enhance its capabilities. | About each service function, you can find detailed information in the [API document](https://modelscope.github.io/agentscope/). 
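
For quick reference, a minimal usage sketch of the two new Wikipedia services listed in the table above (function names, signatures, and the `ServiceExecStatus` export are taken from this patch; the printed fields assume the `status`/`content` layout of `ServiceResponse` used throughout this PR):

```python
from agentscope.service import (
    ServiceExecStatus,
    wikipedia_search,
    wikipedia_search_categories,
)

# Full-text search: on success, `content` holds the plain-text page extract.
res = wikipedia_search("Python (programming language)")
if res.status == ServiceExecStatus.SUCCESS:
    print(res.content[:200])

# Category search: on success, `content` is a list of
# {"pageid", "ns", "title"} member dicts.
cats = wikipedia_search_categories("Machine_learning", max_members=10)
if cats.status == ServiceExecStatus.SUCCESS:
    for member in cats.content:
        print(member["title"])
```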
diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md b/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md index 788d2bdad..85e4174fb 100644 --- a/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md +++ b/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md @@ -27,6 +27,8 @@ | | `dblp_search_publications` | 在dblp数据库里搜索文献。 | | `dblp_search_authors` | 在dblp数据库里搜索作者。 | | | `dblp_search_venues` | 在dblp数据库里搜索期刊,会议及研讨会。 | +| | `wikipedia_search` | 在Wikipedia中进行搜索。 | +| | `wikipedia_search_categories` | 在Wikipedia的Category中搜索相关的category。 | | 文件处理 | `create_file` | 在指定路径创建一个新文件,并可选择添加初始内容。 | | | `delete_file` | 删除由文件路径指定的文件。 | | | `move_file` | 将文件从一个路径移动或重命名到另一个路径。 | @@ -39,7 +41,7 @@ | | `write_json_file` | 将 Python 对象序列化为 JSON 并写入到文件。 | | 多模态 | `dashscope_text_to_image` | 使用 DashScope API 将文本生成图片。 | | | `dashscope_image_to_text` | 使用 DashScope API 根据图片生成文字。 | -| | `dashscope_text_to_audio` | 使用 DashScope API 根据文本生成音频。 | +| | `dashscope_text_to_audio` | 使用 DashScope API 根据文本生成音频。 | | *更多服务即将推出* | | 正在开发更多服务功能,并将添加到 AgentScope 以进一步增强其能力。 | 关于详细的参数、预期输入格式、返回类型,请参阅[API文档](https://modelscope.github.io/agentscope/)。 diff --git a/src/agentscope/service/__init__.py b/src/agentscope/service/__init__.py index f4af8e325..185da8a13 100644 --- a/src/agentscope/service/__init__.py +++ b/src/agentscope/service/__init__.py @@ -41,13 +41,9 @@ from .web.web_digest import digest_webpage, load_web, parse_html_to_text from .web.download import download_from_url -from .web.wiki import ( - wiki_get_category_members, - wiki_get_infobox, - wiki_get_page_content_by_paragraph, - wiki_get_all_wikipedia_tables, - wiki_get_page_images_with_captions, - wiki_page_retrieval +from .web.wikipedia import ( + wikipedia_search, + wikipedia_search_categories, ) @@ -95,11 +91,8 @@ def get_help() -> None: "dashscope_image_to_text", "dashscope_text_to_image", "dashscope_text_to_audio", - "wiki_get_category_members", - "wiki_get_infobox", - "wiki_get_page_content_by_paragraph", - "wiki_get_all_wikipedia_tables", - "wiki_get_page_images_with_captions", + "wikipedia_search", + "wikipedia_search_categories", # to be deprecated "ServiceFactory", ] diff --git a/src/agentscope/service/web/wiki.py b/src/agentscope/service/web/wiki.py deleted file mode 100644 index 9dc9f10c8..000000000 --- a/src/agentscope/service/web/wiki.py +++ /dev/null @@ -1,635 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Search contents from WikiPedia, -including texts, categories, infotable, table,... -""" - -import re - - -from bs4 import BeautifulSoup - -from agentscope.service.service_response import ( - ServiceResponse, - ServiceExecStatus, -) -from agentscope.utils.common import requests_get - - -def _wiki_api(params: dict) -> dict: - """Scratch information via Wiki API""" - url = "https://en.wikipedia.org/w/api.php" - return requests_get(url, params=params) - - -def _check_entity_existence(entity: str) -> ServiceResponse: - """ - Function to check if the eneity exists in Wikipedia - If yes, continue searching; - if not, return top 5 similar entities - - Args: - entity (str): searching keywords - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR - and return top-5 similar entities in `content`. - If entity exists, `status`=SUCCESS, return the original entity in `content`. - - Example 1 (entity exists): - .. code-block:: python - - _check_entity_existence('Hello') - - It returns: - .. 
code-block:: python - - { - 'status': , - 'content': { - 'entity': 'Hello' - } - } - - Example 2 (entity does not exist): - .. code-block:: python - - _check_entity_existence('nihao') - - It returns: - .. code-block:: python - - { - 'status': , - 'content': { - 'similar_entities': [ - 'Ni Hao', - 'Ranma ½', - 'Ni Hao, Kai-Lan', - 'List of Ranma ½ episodes', - 'Studio Deen' - ] - } - } - - """ - search_params = { - "action": "query", - "list": "search", - "srsearch": entity, - "format": "json", - } - - search_data = _wiki_api(search_params) - - if "query" in search_data and "search" in search_data["query"]: - if search_data["query"]["search"]: - exact_match = None - for result in search_data["query"]["search"]: - if result["title"].lower() == entity.lower(): - exact_match = result["title"] - break - if not exact_match: - similar_entities = [ - result["title"] - for result in search_data["query"]["search"][:5] - ] - return ServiceResponse( - ServiceExecStatus.ERROR, - {"similar_entities": similar_entities}, - ) - return ServiceResponse( - ServiceExecStatus.SUCCESS, - {"entity": exact_match}, - ) - return ServiceResponse( - ServiceExecStatus.ERROR, - {"error": "Entity not found"}, - ) - - -def wiki_get_category_members( - entity: str, - max_members: int = 1000, - limit_per_request: int = 500, -) -> ServiceResponse: - """Function to retrieve category members from Wikipedia:Category pages - - Args: - entity (str): searching keywords - max_members (int): maximum number of members to output - limit_per_request (int): number of members retrieved per request - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR - and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, - and return `content` as a list of dicts. - Keys of each dict: - - "pageid": unique page ID for the member - - "ns": namespace for the member, - indicating if the corresponding page is Article/User/... - - "title": title of the member - - Example: - - .. code-block:: python - - members = wiki_get_category_members( - "Machine_learning", - max_members=10 - ) - print(members) - - It returns contents: - - .. code-block:: python - - { - 'status': , - 'content': [ - {'pageid': 67911196, 'ns': 0, - 'title': 'Bayesian learning mechanisms'}, - {'pageid': 233488, 'ns': 0, 'title': 'Machine learning'}, - ...] 
- - } - - """ - params = { - "action": "query", - "list": "categorymembers", - "cmtitle": f"Category:{entity}", - "cmlimit": limit_per_request, # Maximum number of results per request - "format": "json", - } - - members = [] - total_fetched = 0 - - while total_fetched < max_members: - data = _wiki_api(params) - batch_members = data["query"]["categorymembers"] - members.extend(batch_members) - total_fetched += len(batch_members) - - # Check if there is a continuation token - if "continue" in data and total_fetched < max_members: - params["cmcontinue"] = data["continue"]["cmcontinue"] - else: - break - - # If more members were fetched than max_members, trim the list - if len(members) > max_members: - members = members[:max_members] - - if len(members) > 0: - return ServiceResponse(ServiceExecStatus.SUCCESS, members) - - return ServiceResponse(ServiceExecStatus.ERROR, members) - - -def wiki_get_infobox( - entity: str, -) -> ServiceResponse: - """ - Function to retrieve InfoBox from the WikiPedia page - - Args: - entity (str): searching keywords - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR, - and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, - and return `content` as a dict containing information in the InfoBox. - - Example: - - .. code-block:: python - - infobox_data = wiki_get_infobox(entity="Python (programming language)") - print(infobox_data) - - It returns content: - - .. code-block:: python - - { - 'status': , - 'content': {'Paradigm': 'Multi-paradigm : object-oriented ...', - 'Designed\xa0by': 'Guido van Rossum', - 'Developer': 'Python Software Foundation', - 'First\xa0appeared': '20\xa0February 1991 ...', - 'Stable release': '3.12.4 / 6 June 2024 ; ...', - 'Typing discipline': 'duck , dynamic , strong ; ...', - 'OS': 'Tier 1 : 64-bit Linux , macOS ; 。。。', - 'License': 'Python Software Foundation License', - 'Filename extensions': '.py, .pyw, .pyz, [10] .pyi, ...', - 'Website': 'python.org'} - } - """ - - existence_response = _check_entity_existence(entity) - if existence_response.status == ServiceExecStatus.ERROR: - return existence_response - - parse_params = { - "action": "parse", - "page": entity, - "prop": "text", - "format": "json", - } - - parse_data = _wiki_api(parse_params) - - if "parse" in parse_data: - raw_html = parse_data["parse"]["text"]["*"] - soup = BeautifulSoup(raw_html, "html.parser") - infobox = soup.find("table", {"class": "infobox"}) - - if not infobox: - return ServiceResponse(ServiceExecStatus.ERROR, None) - - infobox_data = {} - for row in infobox.find_all("tr"): - header = row.find("th") - value = row.find("td") - if header and value: - key = header.get_text(" ", strip=True) - val = value.get_text(" ", strip=True) - infobox_data[key] = val - - return ServiceResponse(ServiceExecStatus.SUCCESS, infobox_data) - error_message = parse_data.get("error", {}).get( - "info", - "Unknown error occurred", - ) - return ServiceResponse( - ServiceExecStatus.ERROR, - {"error": error_message}, - ) - - -def wiki_get_page_content_by_paragraph( - entity: str, - max_paragraphs: int = 1, -) -> ServiceResponse: - """ - Retrieve content from a Wikipedia page and split it into paragraphs, - excluding section headers. - - Args: - entity (str): search word. - max_paragraphs (int, optional): - The maximum number of paragraphs to retrieve. 
- Default is 1 (retrieve the first paragraph). - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR, - and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, - and return `content` as a list of paragraphs from the Wikipedia page. - - Example: - - .. code-block:: python - - wiki_paragraph = wiki_get_page_content_by_paragraph( - entity="Python (programming language)", - max_paragraphs=1) - print(wiki_paragraph) - - It will return content: - .. code-block:: python - - { - 'status': , - 'content': ['Python is a high-level...'] - } - - """ - existence_response = _check_entity_existence(entity) - if existence_response.status == ServiceExecStatus.ERROR: - return existence_response - - params = { - "action": "query", - "prop": "extracts", - "explaintext": True, - "titles": entity, - "format": "json", - } - - data = _wiki_api(params) - page = next(iter(data["query"]["pages"].values())) - content = page.get("extract", "No content found.") - if content == "No content found.": - return ServiceResponse(ServiceExecStatus.ERROR, content) - - # Split content into paragraphs, including headers - paragraphs = [ - para.strip() - for para in content.split("\n\n") - if para.strip() != "" - ] - - # Return the specified number of paragraphs - if max_paragraphs: - paragraphs = paragraphs[:max_paragraphs] - - return ServiceResponse(ServiceExecStatus.SUCCESS, paragraphs) - - -def wiki_get_all_wikipedia_tables( - entity: str, -) -> ServiceResponse: - """ - Retrieve tables on the Wikipedia page - - Args: - entity (str): search word. - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR, - and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, - and return `content` as a list of tables from the Wikipedia page. - Each table is presented as a dict, - where key is the column name and value is the values for each column. - - Example: - - .. code-block:: python - - wiki_table = wiki_get_all_wikipedia_tables( - entity="Python (programming language)" - ) - print(wiki_table) - - It will return content: - .. code-block:: python - - { - 'status': , - 'content': [ - { - 'Type': ['bool','bytearray',...], - 'Mutability': ['immutable','mutable',...], - ... 
- } - ] - } - - """ - existence_response = _check_entity_existence(entity) - if existence_response.status == ServiceExecStatus.ERROR: - return existence_response - - params = { - "action": "parse", - "page": entity, - "prop": "text", - "format": "json", - } - - data = _wiki_api(params) - raw_html = data["parse"]["text"]["*"] - - soup = BeautifulSoup(raw_html, "html.parser") - tables = soup.find_all("table", {"class": "wikitable"}) - - if not tables: - return ServiceResponse(ServiceExecStatus.ERROR, None) - - all_tables_data = [] - for _, table in enumerate(tables): - headers = [ - header.get_text(strip=True) for header in table.find_all("th") - ] - table_dict = {header: [] for header in headers} - - for row in table.find_all("tr")[1:]: # Skip the header row - cells = row.find_all(["td", "th"]) - if len(cells) == len( - headers, - ): # Ensure the row has the correct number of cells - for i, cell in enumerate(cells): - table_dict[headers[i]].append( - cell.get_text(strip=True), - ) - - all_tables_data.append(table_dict) - - return ServiceResponse(ServiceExecStatus.SUCCESS, all_tables_data) - - -def wiki_get_page_images_with_captions( - entity: str, -) -> ServiceResponse: - """ - Function to retrive images and details on the Wikipedia page - - Args: - entity (str): search word. - - Returns: - `ServiceResponse`: A dictionary containing `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR, - and return top-5 similar entities in `content`. - If the entity exists, `status`=SUCCESS, - and return the `content` as a list of dict from the Wikipedia page. - - Each dict has: - 'title': title of the image - 'url': link to the image - 'caption': caption of the image - - Example: - .. code-block:: python - - wiki_images = wiki_get_page_images_with_captions( - entity="Python (programming language)" - ) - print(wiki_images) - - It will return: - - .. code-block:: python - - { - 'status': , - 'content': [{ - 'title': 'File:Commons-logo.svg', - 'url': 'https://upload.wikimedia.org...', - 'caption': 'The Wikimedia Commons logo,...'}, - ... 
- ] - } - """ - - existence_response = _check_entity_existence(entity) - if existence_response.status == ServiceExecStatus.ERROR: - return existence_response - - params = { - "action": "query", - "prop": "images", - "titles": entity, - "format": "json", - } - data = _wiki_api(params) - page = next(iter(data["query"]["pages"].values())) - images = page.get("images", []) - if len(images) == 0: - return ServiceResponse(ServiceExecStatus.ERROR, None) - - image_details = [] - for image in images: - image_title = image["title"] - params = { - "action": "query", - "titles": image_title, - "prop": "imageinfo", - "iiprop": "url|extmetadata", - "format": "json", - } - data = _wiki_api(params) - image_page = next(iter(data["query"]["pages"].values())) - if "imageinfo" in image_page: - image_info = image_page["imageinfo"][0] - image_url = image_info.get("url", "") - extmetadata = image_info.get("extmetadata", {}) - caption = extmetadata.get("ImageDescription", {}).get( - "value", - "No caption available", - ) - image_details.append( - { - "title": image_title, - "url": image_url, - "caption": caption, - }, - ) - - return ServiceResponse(ServiceExecStatus.SUCCESS, image_details) - - -def wiki_page_retrieval( - entity: str, - max_paragraphs: int = 1, -)-> ServiceResponse: - """ - Function to retrive different format - (infobox, paragraphs, tables, images) - of information on the Wikipedia page - - Args: - entity (str): search word. - max_paragraphs (int, optional): - The maximum number of paragraphs to retrieve. - Default is 1 (retrieve the first paragraph). - - Returns: - A dictionary contains retrieved information of different format. - Keys are four formats: `infobox`, `paragraph`, `table`, `image`. - The value for each key is a `ServiceResponse` object containing - `status` and `content`. - The `status` attribute is from the ServiceExecStatus enum, - indicating success or error. - If the entity does not exist, `status`=ERROR, - otherwise `status`=SUCCESS. - The `content` attribute is the retrieved contents - if `status`=SUCCESS. Contents are different for each format. - `infobox`: Information in the InfoBox. - `paragraph`: A list of paragraphs from the Wikipedia page. The number - of paragraphs is determined by arg `max_paragraphs`. - `table`: A list of tables from the Wikipedia page. Each table - is presented as a dict, where key is the - column name and value is the values for each column. - `image`: A list of dict from the Wikipedia page. - Each dict has: - 'title': title of the image - 'url': link to the image - 'caption': caption of the image - - Example: - .. code-block:: python - - wiki_page_retrieval(entity='Hello', max_paragraphs=1) - - It will return: - - .. code-block:: python - - { - 'infobox': { - 'status': , - 'content': None - }, - 'paragraph': { - 'status': , - 'content': ['Hello is a salutation or greeting in the English language. It is first attested in writing from 1826.'] - }, - 'table': { - 'status': , - 'content': None - }, - 'image': { - 'status': , - 'content': [ - { - 'title': 'File:Semi-protection-shackle.svg', - 'url': 'https://upload.wikimedia.org/wikipedia/en/1/1b/Semi-protection-shackle.svg', - 'caption': '
English: Semi-protection lock with grey shackle\n
' - }, - { - 'title': 'File:TelephoneHelloNellie.jpg', - 'url': 'https://upload.wikimedia.org/wikipedia/commons/b/b3/TelephoneHelloNellie.jpg', - 'caption': 'No caption available' - }, - { - 'title': 'File:Wiktionary-logo-en-v2.svg', - 'url': 'https://upload.wikimedia.org/wikipedia/commons/9/99/Wiktionary-logo-en-v2.svg', - 'caption': 'A logo derived from ...' - } - ] - } - } - - """ - - infobox_retrieval = wiki_get_infobox(entity=entity) - paragraph_retrieval = wiki_get_page_content_by_paragraph( - entity=entity, max_paragraphs=max_paragraphs) - table_retrieval = wiki_get_all_wikipedia_tables(entity=entity) - image_retrieval = wiki_get_page_images_with_captions(entity=entity) - - total_retrieval = { - 'infobox': infobox_retrieval, - 'paragraph': paragraph_retrieval, - 'table': table_retrieval, - 'image': image_retrieval - } - - return total_retrieval diff --git a/src/agentscope/service/web/wikipedia.py b/src/agentscope/service/web/wikipedia.py new file mode 100644 index 000000000..ea10a8f18 --- /dev/null +++ b/src/agentscope/service/web/wikipedia.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +""" +Search contents from WikiPedia +""" +import requests + +from ..service_response import ( + ServiceResponse, + ServiceExecStatus, +) + + +def wikipedia_search_categories( + query: str, + max_members: int = 1000, +) -> ServiceResponse: + """Retrieve categories from Wikipedia:Category pages. + + Args: + query (str): + The given searching keywords + max_members (int): + The maximum number of members to output + + Returns: + `ServiceResponse`: A response that contains the execution status and + returned content. In the returned content, the meanings of keys: + - "pageid": unique page ID for the member + - "ns": namespace for the member + - "title": title of the member + + Example: + + .. code-block:: python + + members = wiki_get_category_members( + "Machine_learning", + max_members=10 + ) + print(members) + + It returns contents: + + .. code-block:: python + + { + 'status': , + 'content': [ + { + 'pageid': 67911196, + 'ns': 0, + 'title': 'Bayesian learning mechanisms' + }, + { + 'pageid': 233488, + 'ns': 0, + 'title': 'Machine learning' + }, + # ... + ] + } + + """ + url = "https://en.wikipedia.org/w/api.php" + limit_per_request: int = 500 + params = { + "action": "query", + "list": "categorymembers", + "cmtitle": f"Category:{query}", + "cmlimit": limit_per_request, # Maximum number of results per request + "format": "json", + } + + members = [] + total_fetched = 0 + + try: + while total_fetched < max_members: + response = requests.get(url, params=params, timeout=20) + response.raise_for_status() + + data = response.json() + + batch_members = data["query"]["categorymembers"] + members.extend(batch_members) + total_fetched += len(batch_members) + + # Check if there is a continuation token + if "continue" in data and total_fetched < max_members: + params["cmcontinue"] = data["continue"]["cmcontinue"] + else: + break + + except Exception as e: + return ServiceResponse( + status=ServiceExecStatus.ERROR, + content=str(e), + ) + + # If more members were fetched than max_members, trim the list + if len(members) > max_members: + members = members[:max_members] + + if len(members) > 0: + return ServiceResponse(ServiceExecStatus.SUCCESS, members) + + return ServiceResponse(ServiceExecStatus.ERROR, members) + + +def wikipedia_search( # pylint: disable=C0301 + query: str, +) -> ServiceResponse: + """Search the given query in Wikipedia. 
Note the returned text maybe related entities, which means you should adjust your query as needed and search again. + + Note the returned text maybe too long for some llm, it's recommended to + summarize the returned text first. + + Args: + query (`str`): + The searched query in wikipedia. + + Return: + `ServiceResponse`: A response that contains the execution status and + returned content. + """ # noqa + + url = "https://en.wikipedia.org/w/api.php" + params = { + "action": "query", + "titles": query, + "prop": "extracts", + "explaintext": True, + "format": "json", + } + try: + response = requests.get(url, params=params, timeout=20) + response.raise_for_status() + data = response.json() + + # Combine into a text + text = [] + for page in data["query"]["pages"].values(): + if "extract" in page: + text.append(page["extract"]) + else: + return ServiceResponse( + status=ServiceExecStatus.ERROR, + content="No content found", + ) + + content = "\n".join(text) + return ServiceResponse( + status=ServiceExecStatus.SUCCESS, + content=content, + ) + + except Exception as e: + return ServiceResponse( + status=ServiceExecStatus.ERROR, + content=str(e), + ) diff --git a/tests/wiki_test.py b/tests/wiki_test.py index 25390176f..1ed4fe375 100644 --- a/tests/wiki_test.py +++ b/tests/wiki_test.py @@ -3,23 +3,19 @@ import unittest from unittest.mock import Mock, patch, MagicMock -from agentscope.service import ServiceResponse from agentscope.service import ( - wiki_get_category_members, - wiki_get_infobox, - wiki_get_page_content_by_paragraph, - wiki_get_all_wikipedia_tables, - wiki_get_page_images_with_captions, - wiki_page_retrieval + wikipedia_search, + wikipedia_search_categories, + ServiceResponse, + ServiceExecStatus, ) -from agentscope.service.service_status import ServiceExecStatus -class TestWiki(unittest.TestCase): +class TestWikipedia(unittest.TestCase): """ExampleTest for a unit test.""" @patch("agentscope.utils.common.requests.get") - def test_wiki_get_category_members( + def test_wikipedia_search_categories( self, mock_get: MagicMock, ) -> None: @@ -52,8 +48,7 @@ def test_wiki_get_category_members( mock_get.return_value = mock_response test_entity = "Test" - max_members = 1 - limit_per_request = 100 + limit_per_request = 500 params = { "action": "query", "list": "categorymembers", @@ -62,14 +57,12 @@ def test_wiki_get_category_members( "format": "json", } - results = wiki_get_category_members( - entity=test_entity, - max_members=max_members, - limit_per_request=limit_per_request, - ) + results = wikipedia_search_categories(query=test_entity) + mock_get.assert_called_once_with( "https://en.wikipedia.org/w/api.php", params=params, + timeout=20, ) self.assertEqual( @@ -78,426 +71,42 @@ def test_wiki_get_category_members( ) @patch("agentscope.utils.common.requests.get") - def test_wiki_get_infobox( - self, - mock_get: MagicMock, - ) -> None: - """Test get_infobox with different parameters and responses""" - - # Mock responses for search query - mock_response_search = Mock() - mock_dict_search = { - "query": { - "search": [ - {"title": "Test"}, - ], - }, - } - - # Mock responses for parse query - mock_response_parse = Mock() - mock_dict_parse = { - "parse": { - "title": "Test", - "pageid": 20, - "text": { - "*": """ - - - - - - - - - -
<tr><th>Column1</th><td>Data1</td></tr>
<tr><th>Column2</th><td>Data2</td></tr>
- """, - }, - }, - } - - expected_result = ServiceResponse( - status=ServiceExecStatus.SUCCESS, - content={ - "Column1": "Data1", - "Column2": "Data2", - }, - ) - - mock_response_search.json.return_value = mock_dict_search - mock_response_parse.json.return_value = mock_dict_parse - mock_get.side_effect = [mock_response_search, mock_response_parse] - - test_entity = "Test" - - results = wiki_get_infobox(entity=test_entity) - - # Define expected calls - calls = [ - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params={ - "action": "query", - "list": "search", - "srsearch": test_entity, - "format": "json", - }, - ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params={ - "action": "parse", - "page": test_entity, - "prop": "text", - "format": "json", - }, - ), - ] - - mock_get.assert_has_calls(calls, any_order=True) - - self.assertEqual(results, expected_result) - - @patch("agentscope.utils.common.requests.get") - def test_wiki_get_page_content_by_paragraph( + def test_wikipedia_search( self, mock_get: MagicMock, ) -> None: """Test get_page_content_by_paragraph""" - # Mock responses for search query - mock_response_search = Mock() - mock_dict_search = { - "query": { - "search": [ - {"title": "Test"}, - ], - }, - } - # Mock responses for extract query - mock_response_extract = Mock() - mock_dict_extract = { - "query": { - "pages": { - "20": { - "pageid": 20, - "title": "Test", - "extract": """ - This is the first paragraph. - - This is the second paragraph. - - == Section Header == - - This is the third paragraph under a section header. - """, - }, - }, - }, - } - - expected_result = ServiceResponse( - status=ServiceExecStatus.SUCCESS, - content=[ - "This is the first paragraph.", - "This is the second paragraph.", - ], - ) - - mock_response_search.json.return_value = mock_dict_search - mock_response_extract.json.return_value = mock_dict_extract - mock_get.side_effect = [mock_response_search, mock_response_extract] - - test_entity = "Test" - - results = wiki_get_page_content_by_paragraph( - entity=test_entity, - max_paragraphs=2, - ) - - # Define expected calls - params1 = { - "action": "query", - "list": "search", - "srsearch": test_entity, - "format": "json", - } - params2 = { - "action": "query", - "prop": "extracts", - "explaintext": True, - "titles": test_entity, - "format": "json", - } - - calls = [ - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params1, - ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params2, - ), - ] - - mock_get.assert_has_calls(calls, any_order=True) - - self.assertEqual(results, expected_result) - - @patch("agentscope.utils.common.requests.get") - def test_wiki_get_all_wikipedia_tables( - self, - mock_get: MagicMock, - ) -> None: - """Test get_all_wikipedia_tables""" - - # Mock responses for search query - mock_response_search = Mock() - mock_dict_search = { - "query": { - "search": [ - {"title": "Test"}, - ], - }, - } - - # Mock responses for parse query - mock_response_parse = Mock() - mock_dict_parse = { - "parse": { - "title": "Test", - "pageid": 20, - "text": { - "*": """ - - - - - - - - - - - - - -
<tr><th>Header1</th><th>Header2</th></tr>
<tr><td>Row1Col1</td><td>Row1Col2</td></tr>
<tr><td>Row2Col1</td><td>Row2Col2</td></tr>
- """, - }, - }, - } - - expected_result = ServiceResponse( - status=ServiceExecStatus.SUCCESS, - content=[ - { - "Header1": ["Row1Col1", "Row2Col1"], - "Header2": ["Row1Col2", "Row2Col2"], - }, - ], - ) - - mock_response_search.json.return_value = mock_dict_search - mock_response_parse.json.return_value = mock_dict_parse - mock_get.side_effect = [mock_response_search, mock_response_parse] - - test_entity = "Test" - - results = wiki_get_all_wikipedia_tables(entity=test_entity) - - # Define expected calls - params1 = { - "action": "query", - "list": "search", - "srsearch": test_entity, - "format": "json", - } - params2 = { - "action": "parse", - "page": test_entity, - "prop": "text", - "format": "json", - } - - calls = [ - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params1, - ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params2, - ), - ] - - mock_get.assert_has_calls(calls, any_order=True) - - self.assertEqual(results, expected_result) - - @patch("agentscope.utils.common.requests.get") - def test_get_page_images_with_captions( - self, - mock_get: MagicMock, - ) -> None: - """Test get_page_images_with_captions""" - - # Mock responses for search query - mock_response_search = Mock() - mock_dict_search = { - "query": { - "search": [ - {"title": "Test"}, - ], - }, - } - - # Mock responses for images query - mock_response_images = Mock() - mock_dict_images = { + mock_response = Mock() + mock_dict = { "query": { "pages": { "20": { "pageid": 20, "title": "Test", - "images": [ - {"title": "Image1"}, - {"title": "Image2"}, - ], + "extract": "This is the first paragraph.", }, - }, - }, - } - - # Mock responses for image details query - mock_response_image1 = Mock() - mock_dict_image1 = { - "query": { - "pages": { - "30": { + "21": { "pageid": 30, - "imageinfo": [ - { - "url": "http://example.com/image1.jpg", - "extmetadata": { - "ImageDescription": { - "value": "Caption for image 1", - }, - }, - }, - ], + "title": "Test", + "extract": "This is the second paragraph.", }, }, }, } - mock_response_image2 = Mock() - mock_dict_image2 = { - "query": { - "pages": { - "31": { - "pageid": 31, - "imageinfo": [ - { - "url": "http://example.com/image2.jpg", - "extmetadata": { - "ImageDescription": { - "value": "Caption for image 2", - }, - }, - }, - ], - }, - }, - }, - } + mock_response.json.return_value = mock_dict + mock_get.return_value = mock_response - expected_result = ServiceResponse( + expected_response = ServiceResponse( status=ServiceExecStatus.SUCCESS, - content=[ - { - "title": "Image1", - "url": "http://example.com/image1.jpg", - "caption": "Caption for image 1", - }, - { - "title": "Image2", - "url": "http://example.com/image2.jpg", - "caption": "Caption for image 2", - }, - ], - ) - - mock_response_search.json.return_value = mock_dict_search - mock_response_images.json.return_value = mock_dict_images - mock_response_image1.json.return_value = mock_dict_image1 - mock_response_image2.json.return_value = mock_dict_image2 - mock_get.side_effect = [ - mock_response_search, - mock_response_images, - mock_response_image1, - mock_response_image2, - ] - - test_entity = "Test" - - results = wiki_get_page_images_with_captions(entity=test_entity) - - # Define expected calls - params1 = { - "action": "query", - "list": "search", - "srsearch": test_entity, - "format": "json", - } - params2 = { - "action": "query", - "prop": "images", - "titles": test_entity, - "format": "json", - } - params3_image1 = { - "action": "query", - "titles": "Image1", - "prop": 
"imageinfo", - "iiprop": "url|extmetadata", - "format": "json", - } - params4_image2 = { - "action": "query", - "titles": "Image2", - "prop": "imageinfo", - "iiprop": "url|extmetadata", - "format": "json", - } - - calls = [ - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params1, - ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params2, + content=( + "This is the first paragraph.\n" + "This is the second paragraph." ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params3_image1, - ), - unittest.mock.call( - "https://en.wikipedia.org/w/api.php", - params=params4_image2, - ), - ] - - mock_get.assert_has_calls(calls, any_order=True) - - self.assertEqual(results, expected_result) - + ) + response = wikipedia_search("Test") -if __name__ == "__main__": - unittest.main() + self.assertEqual(expected_response, response) From 915622145ce3b1d673bc8194db7dda09cfd1d7f8 Mon Sep 17 00:00:00 2001 From: DavdGao Date: Tue, 13 Aug 2024 17:15:25 +0800 Subject: [PATCH 15/16] Withdraw to solve conflict --- .../en/source/tutorial/204-service.md | 66 +++++++++---------- .../zh_CN/source/tutorial/204-service.md | 4 +- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/docs/sphinx_doc/en/source/tutorial/204-service.md b/docs/sphinx_doc/en/source/tutorial/204-service.md index 5c9456dee..dad6fa3d9 100644 --- a/docs/sphinx_doc/en/source/tutorial/204-service.md +++ b/docs/sphinx_doc/en/source/tutorial/204-service.md @@ -12,40 +12,38 @@ AgentScope and how to use them to enhance the capabilities of your agents. The following table outlines the various Service functions by type. These functions can be called using `agentscope.service.{function_name}`. -| Service Scene | Service Function Name | Description | -|-----------------------------|--------------------------------|----------------------------------------------------------------------------------------------------------------| -| Code | `execute_python_code` | Execute a piece of Python code, optionally inside a Docker container. | -| Retrieval | `retrieve_from_list` | Retrieve a specific item from a list based on given criteria. | -| | `cos_sim` | Compute the cosine similarity between two different embeddings. | -| SQL Query | `query_mysql` | Execute SQL queries on a MySQL database and return results. | -| | `query_sqlite` | Execute SQL queries on a SQLite database and return results. | -| | `query_mongodb` | Perform queries or operations on a MongoDB collection. | -| Text Processing | `summarization` | Summarize a piece of text using a large language model to highlight its main points. | -| Web | `bing_search` | Perform bing search | -| | `google_search` | Perform google search | -| | `arxiv_search` | Perform arXiv search | -| | `download_from_url` | Download file from given URL. | -| | `load_web` | Load and parse the web page of the specified url (currently only supports HTML). | -| | `digest_webpage` | Digest the content of a already loaded web page (currently only supports HTML). -| | `dblp_search_publications` | Search publications in the DBLP database -| | `dblp_search_authors` | Search for author information in the DBLP database | -| | `dblp_search_venues` | Search for venue information in the DBLP database | -| | `wikipedia_search` | Search for the given query in Wikipedia API | -| | `wikipedia_search_categories` | Search categories for the given query in Wikipedia:Category pages. 
| -| File | `create_file` | Create a new file at a specified path, optionally with initial content. | -| | `delete_file` | Delete a file specified by a file path. | -| | `move_file` | Move or rename a file from one path to another. | -| | `create_directory` | Create a new directory at a specified path. | -| | `delete_directory` | Delete a directory and all its contents. | -| | `move_directory` | Move or rename a directory from one path to another. | -| | `read_text_file` | Read and return the content of a text file. | -| | `write_text_file` | Write text content to a file at a specified path. | -| | `read_json_file` | Read and parse the content of a JSON file. | -| | `write_json_file` | Serialize a Python object to JSON and write to a file. | -| Multi Modality | `dashscope_text_to_image` | Convert text to image using Dashscope API. | -| | `dashscope_image_to_text` | Convert image to text using Dashscope API. | -| | `dashscope_text_to_audio` | Convert text to audio using Dashscope API. | -| *More services coming soon* | | More service functions are in development and will be added to AgentScope to further enhance its capabilities. | +| Service Scene | Service Function Name | Description | +|-----------------------------|----------------------------|----------------------------------------------------------------------------------------------------------------| +| Code | `execute_python_code` | Execute a piece of Python code, optionally inside a Docker container. | +| Retrieval | `retrieve_from_list` | Retrieve a specific item from a list based on given criteria. | +| | `cos_sim` | Compute the cosine similarity between two different embeddings. | +| SQL Query | `query_mysql` | Execute SQL queries on a MySQL database and return results. | +| | `query_sqlite` | Execute SQL queries on a SQLite database and return results. | +| | `query_mongodb` | Perform queries or operations on a MongoDB collection. | +| Text Processing | `summarization` | Summarize a piece of text using a large language model to highlight its main points. | +| Web | `bing_search` | Perform bing search | +| | `google_search` | Perform google search | +| | `arxiv_search` | Perform arXiv search | +| | `download_from_url` | Download file from given URL. | +| | `load_web` | Load and parse the web page of the specified url (currently only supports HTML). | +| | `digest_webpage` | Digest the content of a already loaded web page (currently only supports HTML). +| | `dblp_search_publications` | Search publications in the DBLP database +| | `dblp_search_authors` | Search for author information in the DBLP database | +| | `dblp_search_venues` | Search for venue information in the DBLP database | +| File | `create_file` | Create a new file at a specified path, optionally with initial content. | +| | `delete_file` | Delete a file specified by a file path. | +| | `move_file` | Move or rename a file from one path to another. | +| | `create_directory` | Create a new directory at a specified path. | +| | `delete_directory` | Delete a directory and all its contents. | +| | `move_directory` | Move or rename a directory from one path to another. | +| | `read_text_file` | Read and return the content of a text file. | +| | `write_text_file` | Write text content to a file at a specified path. | +| | `read_json_file` | Read and parse the content of a JSON file. | +| | `write_json_file` | Serialize a Python object to JSON and write to a file. | +| Multi Modality | `dashscope_text_to_image` | Convert text to image using Dashscope API. 
| +| | `dashscope_image_to_text` | Convert image to text using Dashscope API. | +| | `dashscope_text_to_audio` | Convert text to audio using Dashscope API. | +| *More services coming soon* | | More service functions are in development and will be added to AgentScope to further enhance its capabilities. | About each service function, you can find detailed information in the [API document](https://modelscope.github.io/agentscope/). diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md b/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md index 85e4174fb..788d2bdad 100644 --- a/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md +++ b/docs/sphinx_doc/zh_CN/source/tutorial/204-service.md @@ -27,8 +27,6 @@ | | `dblp_search_publications` | 在dblp数据库里搜索文献。 | | `dblp_search_authors` | 在dblp数据库里搜索作者。 | | | `dblp_search_venues` | 在dblp数据库里搜索期刊,会议及研讨会。 | -| | `wikipedia_search` | 在Wikipedia中进行搜索。 | -| | `wikipedia_search_categories` | 在Wikipedia的Category中搜索相关的category。 | | 文件处理 | `create_file` | 在指定路径创建一个新文件,并可选择添加初始内容。 | | | `delete_file` | 删除由文件路径指定的文件。 | | | `move_file` | 将文件从一个路径移动或重命名到另一个路径。 | @@ -41,7 +39,7 @@ | | `write_json_file` | 将 Python 对象序列化为 JSON 并写入到文件。 | | 多模态 | `dashscope_text_to_image` | 使用 DashScope API 将文本生成图片。 | | | `dashscope_image_to_text` | 使用 DashScope API 根据图片生成文字。 | -| | `dashscope_text_to_audio` | 使用 DashScope API 根据文本生成音频。 | +| | `dashscope_text_to_audio` | 使用 DashScope API 根据文本生成音频。 | | *更多服务即将推出* | | 正在开发更多服务功能,并将添加到 AgentScope 以进一步增强其能力。 | 关于详细的参数、预期输入格式、返回类型,请参阅[API文档](https://modelscope.github.io/agentscope/)。 From bbcfe661853664f50e80e03ca34ee077c1677f3f Mon Sep 17 00:00:00 2001 From: DavdGao Date: Tue, 13 Aug 2024 17:27:25 +0800 Subject: [PATCH 16/16] remove the unnecessary change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 29158f34b..8966431d4 100644 --- a/setup.py +++ b/setup.py @@ -30,9 +30,9 @@ "docker", "pymongo", "pymysql", - "feedparser", "bs4", "beautifulsoup4", + "feedparser", ] doc_requires = [
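
For readers trying out the two services that survive the withdrawals above, `wikipedia_search` and `wikipedia_search_categories`, the sketch below shows one plausible way to call them through `agentscope.service`. It is a minimal sketch, not part of the patches themselves: it assumes only the signatures and `ServiceResponse` fields the patches define, plus live access to the Wikipedia API.

.. code-block:: python

    # Minimal usage sketch for the services added in this patch series.
    # Assumes only what the patches define: both functions return a
    # ServiceResponse carrying a `status` (ServiceExecStatus) and `content`.
    from agentscope.service import (
        ServiceExecStatus,
        wikipedia_search,
        wikipedia_search_categories,
    )

    # Fetch the plain-text extract of a page. The docstring warns the text
    # may be long, so consider summarizing it before passing it to an LLM.
    page = wikipedia_search("Machine learning")
    if page.status == ServiceExecStatus.SUCCESS:
        print(page.content[:200])
    else:
        print("Search failed:", page.content)

    # List up to 10 members of a Wikipedia:Category page; each member is a
    # dict with "pageid", "ns", and "title" keys.
    members = wikipedia_search_categories("Machine_learning", max_members=10)
    if members.status == ServiceExecStatus.SUCCESS:
        for member in members.content:
            print(member["pageid"], member["title"])

Because `wikipedia_search_categories` follows the API's `cmcontinue` continuation token internally, `max_members` is the only knob a caller needs to bound the result size.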