Skip to content

Commit

Permalink
Add Wiki retrieval service (#324)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: DavdGao <[email protected]>
  • Loading branch information
PengfeiHePower and DavdGao authored Aug 14, 2024
1 parent c266df4 commit 01530ee
Show file tree
Hide file tree
Showing 5 changed files with 282 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ the following libraries.
- File Operation
- Text Processing
- Multi Modality
- Wikipedia search and retrieval

**Example Applications**

Expand Down
1 change: 1 addition & 0 deletions README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ AgentScope支持使用以下库快速部署本地模型服务。
- 文件操作
- 文本处理
- 多模态生成
- 维基百科搜索

**样例应用**

Expand Down
7 changes: 7 additions & 0 deletions src/agentscope/service/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@
from .web.web_digest import digest_webpage, load_web, parse_html_to_text
from .web.download import download_from_url

from .web.wikipedia import (
wikipedia_search,
wikipedia_search_categories,
)


def get_help() -> None:
"""Get help message."""
Expand Down Expand Up @@ -80,6 +85,8 @@ def get_help() -> None:
"bing_search",
"google_search",
"arxiv_search",
"wikipedia_search",
"wikipedia_search_categories",
"query_mysql",
"query_sqlite",
"query_mongodb",
Expand Down
161 changes: 161 additions & 0 deletions src/agentscope/service/web/wikipedia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
Search contents from Wikipedia
"""
import requests

from ..service_response import (
ServiceResponse,
ServiceExecStatus,
)


def wikipedia_search_categories(
    query: str,
    max_members: int = 1000,
) -> ServiceResponse:
    """Retrieve the members of a Wikipedia category page.

    The English Wikipedia API is queried for ``Category:<query>`` and the
    result pages are followed through the continuation token until
    `max_members` entries were collected or the category is exhausted.

    Args:
        query (str):
            The given searching keywords
        max_members (int):
            The maximum number of members to output

    Returns:
        `ServiceResponse`: A response that contains the execution status and
        returned content. In the returned content, the meanings of keys:
            - "pageid": unique page ID for the member
            - "ns": namespace for the member
            - "title": title of the member
        If the request fails, the status is ERROR and the content is the
        error message. If no member is found, the status is ERROR and the
        content is an empty list.

    Example:

        .. code-block:: python

            members = wikipedia_search_categories(
                "Machine_learning",
                max_members=10
            )
            print(members)

        It returns contents:

        .. code-block:: python

            {
                'status': <ServiceExecStatus.SUCCESS: 1>,
                'content': [
                    {
                        'pageid': 67911196,
                        'ns': 0,
                        'title': 'Bayesian learning mechanisms'
                    },
                    {
                        'pageid': 233488,
                        'ns': 0,
                        'title': 'Machine learning'
                    },
                    # ...
                ]
            }
    """
    url = "https://en.wikipedia.org/w/api.php"
    limit_per_request: int = 500
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{query}",
        "cmlimit": limit_per_request,  # Maximum number of results per request
        "format": "json",
    }

    members = []

    try:
        while len(members) < max_members:
            # Ask only for what is still missing, so that a small
            # `max_members` does not download a full batch of 500 entries
            # that would be thrown away afterwards.
            params["cmlimit"] = min(
                limit_per_request,
                max_members - len(members),
            )

            response = requests.get(url, params=params, timeout=20)
            response.raise_for_status()

            data = response.json()

            # The API can report a failure inside a successful HTTP
            # response; surface its message instead of a bare KeyError.
            if "error" in data:
                error = data["error"]
                message = None
                if isinstance(error, dict):
                    message = error.get("info")
                return ServiceResponse(
                    status=ServiceExecStatus.ERROR,
                    content=str(message or error),
                )

            members.extend(data["query"]["categorymembers"])

            # Follow the continuation token while more members exist
            if "continue" in data:
                params["cmcontinue"] = data["continue"]["cmcontinue"]
            else:
                break

    except Exception as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(e),
        )

    # Safety net: never hand back more members than requested
    members = members[:max_members]

    if members:
        return ServiceResponse(ServiceExecStatus.SUCCESS, members)

    return ServiceResponse(ServiceExecStatus.ERROR, members)


def wikipedia_search(  # pylint: disable=C0301
    query: str,
) -> ServiceResponse:
    """Search the given query in Wikipedia. Note the returned text may be related entities, which means you should adjust your query as needed and search again.

    Note the returned text may be too long for some llm, it's recommended to
    summarize the returned text first.

    Args:
        query (`str`):
            The searched query in wikipedia.

    Returns:
        `ServiceResponse`: A response that contains the execution status and
        returned content. On success the content is the plain-text extracts
        of the matched pages joined by newlines. On failure the content is
        an error message.
    """  # noqa

    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": query,
        "prop": "extracts",
        "explaintext": True,
        "format": "json",
    }
    try:
        response = requests.get(url, params=params, timeout=20)
        response.raise_for_status()
        data = response.json()

        # The API can report a failure inside a successful HTTP response;
        # surface its message instead of a bare KeyError on "query".
        if "error" in data:
            error = data["error"]
            message = None
            if isinstance(error, dict):
                message = error.get("info")
            return ServiceResponse(
                status=ServiceExecStatus.ERROR,
                content=str(message or error),
            )

        # A response without any page carries nothing to return
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return ServiceResponse(
                status=ServiceExecStatus.ERROR,
                content="No content found",
            )

        # Combine into a text
        text = []
        for page in pages.values():
            if "extract" not in page:
                return ServiceResponse(
                    status=ServiceExecStatus.ERROR,
                    content="No content found",
                )
            text.append(page["extract"])

        return ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content="\n".join(text),
        )

    except Exception as e:
        return ServiceResponse(
            status=ServiceExecStatus.ERROR,
            content=str(e),
        )
112 changes: 112 additions & 0 deletions tests/wiki_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
"""Wiki retriever test."""
import unittest
from unittest.mock import Mock, patch, MagicMock

from agentscope.service import (
wikipedia_search,
wikipedia_search_categories,
ServiceResponse,
ServiceExecStatus,
)


class TestWikipedia(unittest.TestCase):
    """Unit tests for the Wikipedia service functions.

    `requests.get` is replaced by a mock in every test, so no network
    access takes place.
    """

    @patch("agentscope.utils.common.requests.get")
    def test_wikipedia_search_categories(
        self,
        mock_get: MagicMock,
    ) -> None:
        """Check the request sent and the members returned for a category."""
        category = "Test"

        # Fake API answer holding a single category member
        fake_response = Mock()
        fake_response.json.return_value = {
            "query": {
                "categorymembers": [
                    {
                        "pageid": 20,
                        "ns": 0,
                        "title": "This is a test",
                    },
                ],
            },
        }
        mock_get.return_value = fake_response

        outcome = wikipedia_search_categories(query=category)

        # Exactly one request with the default batch size is expected
        mock_get.assert_called_once_with(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "list": "categorymembers",
                "cmtitle": f"Category:{category}",
                "cmlimit": 500,
                "format": "json",
            },
            timeout=20,
        )

        self.assertEqual(
            outcome,
            ServiceResponse(
                status=ServiceExecStatus.SUCCESS,
                content=[
                    {
                        "pageid": 20,
                        "ns": 0,
                        "title": "This is a test",
                    },
                ],
            ),
        )

    @patch("agentscope.utils.common.requests.get")
    def test_wikipedia_search(
        self,
        mock_get: MagicMock,
    ) -> None:
        """Check that the extracts of all pages are joined by newlines."""
        # Fake API answer holding two pages, each with an extract
        fake_pages = {
            "20": {
                "pageid": 20,
                "title": "Test",
                "extract": "This is the first paragraph.",
            },
            "21": {
                "pageid": 30,
                "title": "Test",
                "extract": "This is the second paragraph.",
            },
        }
        fake_response = Mock()
        fake_response.json.return_value = {"query": {"pages": fake_pages}}
        mock_get.return_value = fake_response

        wanted = ServiceResponse(
            status=ServiceExecStatus.SUCCESS,
            content=(
                "This is the first paragraph.\n"
                "This is the second paragraph."
            ),
        )

        actual = wikipedia_search("Test")

        self.assertEqual(wanted, actual)

0 comments on commit 01530ee

Please sign in to comment.