From df5a57435b93f6baebd0cce18a7bc284e9597e02 Mon Sep 17 00:00:00 2001 From: "R.Noto" Date: Sun, 22 Oct 2023 12:32:33 +0900 Subject: [PATCH 1/3] =?UTF-8?q?=E5=AD=A6=E9=83=A8=E5=90=8D=E3=81=AE?= =?UTF-8?q?=E3=83=91=E3=83=BC=E3=82=B9=E5=87=A6=E7=90=86=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/__init__.py | 0 src/models/__init__.py | 0 src/models/base_info.py | 4 +- src/models/faculty.py | 22 ++++++++ src/parser/base_info_parser.py | 42 ++------------- src/parser/faculty_parser.py | 34 ++++++++++++ src/parser/parser.py | 43 +++++++++++++++ tests/parser/files/~$single_sheet_file1.xlsx | Bin 0 -> 165 bytes tests/parser/files/~$single_sheet_file2.xlsx | Bin 0 -> 165 bytes tests/parser/single_sheet_test_base.py | 36 +++++++++++++ tests/parser/test_base_info_parser.py | 53 ++++++------------- tests/parser/test_faculty_parser.py | 46 ++++++++++++++++ 12 files changed, 203 insertions(+), 77 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/models/__init__.py create mode 100644 src/models/faculty.py create mode 100644 src/parser/faculty_parser.py create mode 100644 src/parser/parser.py create mode 100644 tests/parser/files/~$single_sheet_file1.xlsx create mode 100644 tests/parser/files/~$single_sheet_file2.xlsx create mode 100644 tests/parser/single_sheet_test_base.py create mode 100644 tests/parser/test_faculty_parser.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/base_info.py b/src/models/base_info.py index f0afa49..975271d 100644 --- a/src/models/base_info.py +++ b/src/models/base_info.py @@ -4,9 +4,9 @@ def __init__(self, school_code="", president="") -> None: self.__president: str = president @property - def school_code(self): + def school_code(self) -> str: return self.__school_code @property - def president(self): + def president(self) -> str: return self.__president diff --git a/src/models/faculty.py b/src/models/faculty.py new file mode 100644 index 0000000..555a76c --- /dev/null +++ b/src/models/faculty.py @@ -0,0 +1,22 @@ +class Departments: + def __init__(self, name: str) -> None: + self.__name: str = name + pass + + @property + def name(self) -> str: + return self.__name + + +class Faculty: + def __init__(self, name: str) -> None: + self.__name: str = name + pass + + @property + def name(self) -> str: + return self.__name + + @property + def departments(self) -> list[Departments]: + return [] diff --git a/src/parser/base_info_parser.py b/src/parser/base_info_parser.py index c8ac257..537cb8e 100644 --- a/src/parser/base_info_parser.py +++ b/src/parser/base_info_parser.py @@ -2,57 +2,25 @@ from openpyxl.worksheet.worksheet import Worksheet from openpyxl.cell.cell import Cell from src.models.base_info import BaseInfo +from src.parser.parser import Parser -class BaseInfoParser: - def __init__(self, sheet: Worksheet) -> None: - self.__sheet = sheet - +class BaseInfoParser(Parser): def parse(self) -> BaseInfo: """ 学校コードのセルを基準にデータを検索しパースする。 """ base_cell: Optional[Cell] = self._search_cell( "学校コード", - self.__sheet, + self._sheet, ) if not base_cell: raise ValueError("学校コードが見つかりませんでした。") return BaseInfo( - school_code=self.__sheet.cell( + school_code=self._sheet.cell( row=base_cell.row + 1, column=base_cell.column ).value, - president=self.__sheet.cell( + president=self._sheet.cell( row=base_cell.row + 1, column=base_cell.column + 1 ).value, ) - - def _search_cell(self, keyword: str, sheet: Worksheet) -> Optional[Cell]: - """ - keywordをシートから検索し、最初に見つけたセルを返す。 - 検索はA1→A2→B2→B1→A3→B3→C3→C2→C1 - のようにA1から(max_row, max_column)に直線を引くような方向で検索をする。 - - Args: - keyword (str): 検索するキーワード - sheet (Worksheet): 検索対象のシート - """ - - for i in range(sheet.max_column): - column = i + 1 - # columnを縦に検索 - for j in range(column - 1): - row = i + 1 - column = j + 1 - if sheet.cell(row=row, column=column).value == keyword: - return sheet.cell(row=row, column=column) - - # rowとcolumnが同じセルを検索 - if sheet.cell(row=column, column=column).value == keyword: - return sheet.cell(row=column, column=column) - - # rowを右から左に検索 - for j in reversed(range(column - 1)): - row = j + 1 - if sheet.cell(row=row, column=column).value == keyword: - return sheet.cell(row=row, column=column) diff --git a/src/parser/faculty_parser.py b/src/parser/faculty_parser.py new file mode 100644 index 0000000..fffa6a9 --- /dev/null +++ b/src/parser/faculty_parser.py @@ -0,0 +1,34 @@ +from typing import Optional +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.cell.cell import Cell +from src.models.faculty import Faculty +from src.parser.parser import Parser + + +class FacultyParser(Parser): + def parse(self) -> list[Faculty]: + """ + 学部のセルを基準にデータを検索しパースする。 + """ + base_cell: Optional[Cell] = self._search_cell( + "学部", + self._sheet, + ) + if not base_cell: + raise ValueError('"学部"の文字が見つかりませんでした。') + + row = base_cell.row + 3 # 学部の3行下から開始 + column = base_cell.column + faculties: list[Faculty] = [] + # 空白のセルが見つかるまで、下を検索 + while self._sheet.cell(row=row, column=column).value not in [None, ""]: + faculty = self._sheet.cell(row=row, column=column).value + + # 初めて見つけた学科の場合は新規追加 + if faculty not in [f.name for f in faculties]: + faculties.append( + Faculty(self._sheet.cell(row=row, column=column).value) + ) + row += 1 + + return faculties diff --git a/src/parser/parser.py b/src/parser/parser.py new file mode 100644 index 0000000..876293c --- /dev/null +++ b/src/parser/parser.py @@ -0,0 +1,43 @@ +from abc import ABC, abstractmethod +from openpyxl.worksheet.worksheet import Worksheet +from openpyxl.cell.cell import Cell +from typing import Optional + + +class Parser(ABC): + def __init__(self, sheet: Worksheet) -> None: + self._sheet = sheet + + @abstractmethod + def parse(): + pass + + def _search_cell(self, keyword: str, sheet: Worksheet) -> Optional[Cell]: + """ + keywordをシートから検索し、最初に見つけたセルを返す。 + 検索はA1→A2→B2→B1→A3→B3→C3→C2→C1 + のようにA1から(max_row, max_column)に直線を引くような方向で検索をする。 + + Args: + keyword (str): 検索するキーワード + sheet (Worksheet): 検索対象のシート + """ + # 行か列の大きい方でループ + for i in range(max([sheet.max_column, sheet.max_row])): + column = i + 1 + # columnを縦に検索 + for j in range(column - 1): + row = i + 1 + column = j + 1 + if sheet.cell(row=row, column=column).value == keyword: + return sheet.cell(row=row, column=column) + + # rowとcolumnが同じセルを検索 + if sheet.cell(row=column, column=column).value == keyword: + return sheet.cell(row=column, column=column) + + # rowを右から左に検索 + for j in reversed(range(column - 1)): + row = j + 1 + if sheet.cell(row=row, column=column).value == keyword: + return sheet.cell(row=row, column=column) diff --git a/tests/parser/files/~$single_sheet_file1.xlsx b/tests/parser/files/~$single_sheet_file1.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e7b313b8889ec07aa32c26be93469241453e482d GIT binary patch literal 165 gcmZPz()05V@mC-burUNN=rQ;)_%nn6u>y@D03y2&jQ{`u literal 0 HcmV?d00001 diff --git a/tests/parser/files/~$single_sheet_file2.xlsx b/tests/parser/files/~$single_sheet_file2.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e7b313b8889ec07aa32c26be93469241453e482d GIT binary patch literal 165 gcmZPz()05V@mC-burUNN=rQ;)_%nn6u>y@D03y2&jQ{`u literal 0 HcmV?d00001 diff --git a/tests/parser/single_sheet_test_base.py b/tests/parser/single_sheet_test_base.py new file mode 100644 index 0000000..676b2fe --- /dev/null +++ b/tests/parser/single_sheet_test_base.py @@ -0,0 +1,36 @@ +from abc import ABC, abstractmethod +import openpyxl +from openpyxl.worksheet.worksheet import Worksheet +import os +import pytest +from typing import Generic, TypeVar + +from src.parser.parser import Parser + +T = TypeVar("T") + + +class SingleSheetTestBase(ABC, Generic[T]): + EXCEL_FILE_PATH_1 = f"{os.path.dirname(__file__)}/files/single_sheet_file1.xlsx" + EXCEL_FILE_PATH_2 = f"{os.path.dirname(__file__)}/files/single_sheet_file2.xlsx" + __book: openpyxl.Workbook + + def setup_method(self, method): + pass + + def teardown_method(self, method): + if self.__book: + self.__book.close() + + def get_sheet(self, path) -> Worksheet: + self.__book = openpyxl.load_workbook(path) + return self.__book[self.__book.sheetnames[0]] + + def parse(self, path: str) -> T: + sheet = self.get_sheet(path) + parser = self.get_parser(sheet) + return parser.parse() + + @abstractmethod + def get_parser(self, sheet: Worksheet) -> Parser: + pass diff --git a/tests/parser/test_base_info_parser.py b/tests/parser/test_base_info_parser.py index 485bdff..c0f3c7a 100644 --- a/tests/parser/test_base_info_parser.py +++ b/tests/parser/test_base_info_parser.py @@ -1,56 +1,33 @@ -import openpyxl from openpyxl.worksheet.worksheet import Worksheet -import os import pytest from src.models.base_info import BaseInfo from src.parser.base_info_parser import BaseInfoParser +from src.parser.parser import Parser +from tests.parser.single_sheet_test_base import SingleSheetTestBase -class TestBaseInfoParser: - EXCEL_FILE_PATH_1 = f"{os.path.dirname(__file__)}/files/single_sheet_file1.xlsx" - EXCEL_FILE_PATH_2 = f"{os.path.dirname(__file__)}/files/single_sheet_file2.xlsx" - __book: openpyxl.Workbook - - @pytest.fixture - def wrap(self): - self.set_up() - yield - self.tear_down() - - def set_up(self): - pass - - def tear_down(self): - if self.__book: - self.__book.close() - - def get_sheet(self, path) -> Worksheet: - self.__book = openpyxl.load_workbook(path) - return self.__book[self.__book.sheetnames[0]] - - def parse(self, path: str) -> BaseInfo: - sheet = self.get_sheet(path) - parser = BaseInfoParser(sheet) - return parser.parse() +class TestBaseInfoParser(SingleSheetTestBase[BaseInfo]): + def get_parser(self, sheet: Worksheet) -> Parser: + return BaseInfoParser(sheet) @pytest.mark.parametrize( - "path, expect", + "path, exp", [ - (EXCEL_FILE_PATH_1, "F101110100010"), - (EXCEL_FILE_PATH_2, "F123310106522"), + (SingleSheetTestBase.EXCEL_FILE_PATH_1, "F101110100010"), + (SingleSheetTestBase.EXCEL_FILE_PATH_2, "F123310106522"), ], ) - def test_should_parse_school_code(self, path: str, expect: str): + def test_should_parse_school_code(self, path: str, exp: str): base_info = self.parse(path) - assert base_info.school_code == expect + assert base_info.school_code == exp @pytest.mark.parametrize( - "path, expect", + "path, exp", [ - (EXCEL_FILE_PATH_1, "寳金 清博"), - (EXCEL_FILE_PATH_2, "景山 節"), + (SingleSheetTestBase.EXCEL_FILE_PATH_1, "寳金 清博"), + (SingleSheetTestBase.EXCEL_FILE_PATH_2, "景山 節"), ], ) - def test_should_parse_president(self, path: str, expect: str): + def test_should_parse_president(self, path: str, exp: str): base_info = self.parse(path) - assert base_info.president == expect + assert base_info.president == exp diff --git a/tests/parser/test_faculty_parser.py b/tests/parser/test_faculty_parser.py new file mode 100644 index 0000000..947e287 --- /dev/null +++ b/tests/parser/test_faculty_parser.py @@ -0,0 +1,46 @@ +from openpyxl.worksheet.worksheet import Worksheet +import pytest +from src.models.faculty import Faculty +from src.parser.faculty_parser import FacultyParser +from src.parser.parser import Parser +from tests.parser.single_sheet_test_base import SingleSheetTestBase + + +class TestFacultyParser(SingleSheetTestBase[list[Faculty]]): + def get_parser(self, sheet: Worksheet) -> Parser: + return FacultyParser(sheet) + + @pytest.mark.parametrize( + "path, exp", + [ + ( + SingleSheetTestBase.EXCEL_FILE_PATH_1, + [ + "文学部", + "教育学部", + "法学部", + "経済学部", + "理学部", + "医学部", + "歯学部", + "薬学部", + "工学部", + "農学部", + "獣医学部", + "水産学部", + ], + ), + ( + SingleSheetTestBase.EXCEL_FILE_PATH_2, + [ + "情報メディア学部", + "健康生活学部", + ], + ), + ], + ) + def test_should_parse_faculty(self, path: str, exp: list[str]): + faculties = self.parse(path) + for faculty in faculties: + assert faculty.name in exp + exp.remove(faculty.name) From 0893bdff67eb9e2870f4ac5cd2cff4e94feb64af Mon Sep 17 00:00:00 2001 From: "R.Noto" Date: Sun, 22 Oct 2023 12:34:48 +0900 Subject: [PATCH 2/3] =?UTF-8?q?xlsx=E3=82=AA=E3=83=BC=E3=83=97=E3=83=B3?= =?UTF-8?q?=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E3=81=AE=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 68bc17f..ef0f9e4 100644 --- a/.gitignore +++ b/.gitignore @@ -158,3 +158,5 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +~*.xlsx From 209d2b6a6f0047f76f1fb8ce7e98d3a334c81737 Mon Sep 17 00:00:00 2001 From: "R.Noto" Date: Sun, 22 Oct 2023 15:15:50 +0900 Subject: [PATCH 3/3] =?UTF-8?q?=E5=AD=A6=E7=A7=91=E3=81=AE=E3=83=91?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E5=87=A6=E7=90=86=E4=BD=9C=E6=88=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/models/faculty.py | 10 ++-- src/parser/faculty_parser.py | 23 ++++++-- tests/parser/test_faculty_parser.py | 89 ++++++++++++++++++++++++++++- 3 files changed, 112 insertions(+), 10 deletions(-) diff --git a/src/models/faculty.py b/src/models/faculty.py index 555a76c..d703eed 100644 --- a/src/models/faculty.py +++ b/src/models/faculty.py @@ -1,4 +1,4 @@ -class Departments: +class Department: def __init__(self, name: str) -> None: self.__name: str = name pass @@ -9,14 +9,14 @@ def name(self) -> str: class Faculty: - def __init__(self, name: str) -> None: + def __init__(self, name: str, departments: list[Department]) -> None: self.__name: str = name - pass + self.__departments: list[Department] = departments @property def name(self) -> str: return self.__name @property - def departments(self) -> list[Departments]: - return [] + def departments(self) -> list[Department]: + return self.__departments diff --git a/src/parser/faculty_parser.py b/src/parser/faculty_parser.py index fffa6a9..3797328 100644 --- a/src/parser/faculty_parser.py +++ b/src/parser/faculty_parser.py @@ -1,7 +1,7 @@ from typing import Optional from openpyxl.worksheet.worksheet import Worksheet from openpyxl.cell.cell import Cell -from src.models.faculty import Faculty +from src.models.faculty import Department, Faculty from src.parser.parser import Parser @@ -22,13 +22,28 @@ def parse(self) -> list[Faculty]: faculties: list[Faculty] = [] # 空白のセルが見つかるまで、下を検索 while self._sheet.cell(row=row, column=column).value not in [None, ""]: - faculty = self._sheet.cell(row=row, column=column).value + faculty_name = self._sheet.cell(row=row, column=column).value # 初めて見つけた学科の場合は新規追加 - if faculty not in [f.name for f in faculties]: + if faculty_name not in [f.name for f in faculties]: faculties.append( - Faculty(self._sheet.cell(row=row, column=column).value) + Faculty( + self._sheet.cell(row=row, column=column).value, + [ + Department( + self._sheet.cell(row=row, column=column + 2).value, + ) + ], + ) ) + else: + for faculty in faculties: + if faculty.name == faculty_name: + faculty.departments.append( + Department( + self._sheet.cell(row=row, column=column + 2).value + ), + ), row += 1 return faculties diff --git a/tests/parser/test_faculty_parser.py b/tests/parser/test_faculty_parser.py index 947e287..de997e4 100644 --- a/tests/parser/test_faculty_parser.py +++ b/tests/parser/test_faculty_parser.py @@ -1,6 +1,6 @@ from openpyxl.worksheet.worksheet import Worksheet import pytest -from src.models.faculty import Faculty +from src.models.faculty import Department, Faculty from src.parser.faculty_parser import FacultyParser from src.parser.parser import Parser from tests.parser.single_sheet_test_base import SingleSheetTestBase @@ -44,3 +44,90 @@ def test_should_parse_faculty(self, path: str, exp: list[str]): for faculty in faculties: assert faculty.name in exp exp.remove(faculty.name) + + @pytest.mark.parametrize( + "path, exp", + [ + ( + SingleSheetTestBase.EXCEL_FILE_PATH_1, + { + "文学部": [ + "人文科学科", + ], + "教育学部": [ + "教育学科", + ], + "法学部": [ + "法学課程", + ], + "経済学部": [ + "経済学科", + "経営学科", + ], + "理学部": [ + "数学科", + "物理学科", + "化学科", + "生物科学科", + "地球惑星科学科", + ], + "医学部": [ + "医学科", + "保健学科", + ], + "歯学部": [ + "歯学科", + ], + "薬学部": [ + "薬科学科", + "薬学科", + ], + "工学部": [ + "応用理工系学科", + "情報エレクトロニクス学科", + "機械知能工学科", + "環境社会工学科", + "(共通)", + ], + "農学部": [ + "生物資源科学科", + "応用生命科学科", + "生物機能化学科", + "森林科学科", + "畜産科学科", + "生物環境工学科", + "農業経済学科", + ], + "獣医学部": [ + "共同獣医学課程", + ], + "水産学部": [ + "海洋生物科学科", + "海洋資源科学科", + "増殖生命科学科", + "資源機能化学科", + ], + }, + ), + ( + SingleSheetTestBase.EXCEL_FILE_PATH_2, + { + "情報メディア学部": [ + "情報メディア学科", + ], + "健康生活学部": [ + "健康栄養学科", + "フードビジネス学科", + ], + }, + ), + ], + ) + def test_should_parse_departments(self, path: str, exp: dict[str, list[str]]): + faculties = self.parse(path) + for faculty in faculties: + exp_departments: list[Department] = exp[faculty.name] + assert len(exp_departments) == len(faculty.departments) + for department in faculty.departments: + assert department.name in exp_departments + exp_departments.remove(department.name)