Skip to content

Commit

Permalink
feat: add EduInfoFileField, EduStructureNode.gather_leafs_of_type
Browse files Browse the repository at this point in the history
… and move regexes

refactor: rename `_gather_structure` to `gather_structure`, `_convert_structure_to_dataframe` to `_convert_course_structure_to_dataframe_recursively` and `get_course_structure` to `convert_course_structure_to_dataframe`
feat: now `convert_course_structure_to_dataframe` accepts a course structure instead of a course path
  • Loading branch information
GirZ0n committed Jan 29, 2024
1 parent a66c203 commit cc52015
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 67 deletions.
30 changes: 29 additions & 1 deletion jba/src/models/edu_structure.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
import re
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import List, Optional
from typing import List, Optional, Dict, Tuple

from core.src.utils.file.extension_utils import AnalysisExtension

INFO_FILE_REGEX = re.compile(f'([a-z]+)-info{AnalysisExtension.YAML.value}')
REMOTE_INFO_FILE_REGEX = re.compile(f'([a-z]+)-remote-info{AnalysisExtension.YAML.value}')


class EduInfoFileField(Enum):
    """Field names looked up in the YAML info files of a course.

    Each member's ``value`` is the literal key passed to the YAML reader
    or used to index an entry of the ``files`` list.
    """

    # Identifier of a structure node.
    ID = 'id'
    # Names of the child items of a structure node.
    CONTENT = 'content'
    # List of file entries declared for a task.
    FILES = 'files'
    # Per-file flag used to filter the entries of ``files``.
    VISIBLE = 'visible'
    # Kind of a lesson (for example, a framework lesson).
    TYPE = 'type'
    # Per-file name of an entry of ``files``.
    NAME = 'name'


class EduStructureType(Enum):
Expand All @@ -18,6 +33,19 @@ class EduStructureNode:
structure_type: EduStructureType
children: Optional[List['EduStructureNode']]

def gather_leafs_of_type(self, leaf_type: 'EduStructureType') -> Dict[Tuple[str, ...], 'EduStructureNode']:
    """Collect the nearest descendants (or this node itself) of the given structure type.

    The search stops at the first node of ``leaf_type`` found on each branch,
    so nodes nested below a matching node are not visited.

    :param leaf_type: structure type of the nodes to collect.
    :return: mapping from the path of node names (starting with this node's
        name and ending with the matching node's name) to the matching node.
        The mapping is empty if no node of ``leaf_type`` is reachable.
    """
    if self.structure_type == leaf_type:
        # This node matches: its path is just its own name.
        return {(self.name,): self}

    if self.children is None:
        # A node without content of a different type contributes nothing.
        return {}

    # Prefix every path found in the subtrees with this node's name.
    return {
        (self.name, *path): leaf
        for child in self.children
        for path, leaf in child.gather_leafs_of_type(leaf_type).items()
    }


@dataclass(frozen=True)
class EduLesson:
Expand Down
50 changes: 23 additions & 27 deletions jba/src/processing/collect_course_structure.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import re
from os import listdir

import argparse
from pathlib import Path

import argparse
import pandas as pd

from core.src.utils.df_utils import write_df
from core.src.utils.file.extension_utils import AnalysisExtension
from core.src.utils.file.file_utils import find_files_by_regex
from core.src.utils.file.yaml_utils import read_yaml_field_content
from jba.src.models.edu_columns import (
EduColumnName,
Expand All @@ -16,26 +14,24 @@
ID_COLUMN_POSTFIX,
NAME_COLUMN_POSTFIX,
)
from jba.src.models.edu_structure import EduStructureNode, EduStructureType

CONTENT_META_FIELD = 'content'
ID_META_FIELD = 'id'

INFO_FILE_REGEX = re.compile(f'([a-z]+)-info{AnalysisExtension.YAML.value}')
REMOTE_INFO_FILE_REGEX = re.compile(f'([a-z]+)-remote-info{AnalysisExtension.YAML.value}')

from jba.src.models.edu_structure import (
EduStructureNode,
EduStructureType,
EduInfoFileField,
INFO_FILE_REGEX,
REMOTE_INFO_FILE_REGEX,
)

def _gather_structure(root: Path) -> EduStructureNode: # noqa: WPS238
file_names = listdir(root)

info_files = list(filter(lambda file_name: re.match(INFO_FILE_REGEX, file_name), file_names))
def gather_structure(root: Path) -> EduStructureNode: # noqa: WPS238
info_files = find_files_by_regex(root, INFO_FILE_REGEX)
if len(info_files) != 1:
raise ValueError(f'The number of info files in {root} must be exactly 1 (actual: {len(info_files)}).')

info_file = info_files[0]
info_file_structure_type = re.match(INFO_FILE_REGEX, info_file).group(1)

remote_info_files = list(filter(lambda file_name: re.match(REMOTE_INFO_FILE_REGEX, file_name), file_names))
remote_info_files = find_files_by_regex(root, REMOTE_INFO_FILE_REGEX)
if len(remote_info_files) != 1:
raise ValueError(
f'The number of remote info files in {root} must be exactly 1 (actual: {len(remote_info_files)}).',
Expand All @@ -49,22 +45,22 @@ def _gather_structure(root: Path) -> EduStructureNode: # noqa: WPS238

structure_type = EduStructureType(info_file_structure_type)

structure_id = read_yaml_field_content(root / remote_info_file, ID_META_FIELD)
structure_id = read_yaml_field_content(root / remote_info_file, EduInfoFileField.ID.value)
if structure_id is None:
raise ValueError(f'{root / remote_info_file} must contain the {ID_META_FIELD} field.')
raise ValueError(f'{root / remote_info_file} must contain the {EduInfoFileField.ID.value} field.')

children = None
content = read_yaml_field_content(root / info_file, CONTENT_META_FIELD)
content = read_yaml_field_content(root / info_file, EduInfoFileField.CONTENT.value)
if content is not None:
children = [_gather_structure(root / name) for name in content]
children = [gather_structure(root / name) for name in content]

if not all([node.structure_type == children[0].structure_type for node in children]):
raise ValueError(f'All children nodes inside {root} must have the same structure type.')

return EduStructureNode(structure_id, root.name, structure_type, children)


def _convert_structure_to_dataframe(structure: EduStructureNode) -> pd.DataFrame:
def _convert_course_structure_to_dataframe_recursively(structure: EduStructureNode) -> pd.DataFrame:
if structure.children is None:
# If node has no content, then it is a task node
return pd.DataFrame.from_dict(
Expand All @@ -73,7 +69,7 @@ def _convert_structure_to_dataframe(structure: EduStructureNode) -> pd.DataFrame

children_dfs = []
for i, node in enumerate(structure.children, start=1):
node_df = _convert_structure_to_dataframe(node)
node_df = _convert_course_structure_to_dataframe_recursively(node)
node_df[f'{node.structure_type.value}_{NUMBER_COLUMN_POSTFIX}'] = i
node_df[f'{node.structure_type.value}_{AMOUNT_COLUMN_POSTFIX}'] = len(structure.children)
children_dfs.append(node_df)
Expand All @@ -85,9 +81,8 @@ def _convert_structure_to_dataframe(structure: EduStructureNode) -> pd.DataFrame
return structure_df


def get_course_structure(course_root: Path) -> pd.DataFrame:
course_structure = _gather_structure(course_root)
course_structure_df = _convert_structure_to_dataframe(course_structure)
def convert_course_structure_to_dataframe(course_structure: EduStructureNode) -> pd.DataFrame:
course_structure_df = _convert_course_structure_to_dataframe_recursively(course_structure)

# Removing unnecessary column
course_structure_df.drop(
Expand Down Expand Up @@ -122,8 +117,9 @@ def main():

args = parser.parse_args()

course_structure = get_course_structure(args.course_sources_path)
write_df(course_structure, args.output_path)
course_structure = gather_structure(args.course_sources_path)
course_structure_df = convert_course_structure_to_dataframe(course_structure)
write_df(course_structure_df, args.output_path)


if __name__ == '__main__':
Expand Down
37 changes: 17 additions & 20 deletions jba/src/processing/tasktracker_content_collector.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,24 @@
import re
from dataclasses import dataclass
from os import listdir
from pathlib import Path
from typing import List, Dict

import argparse
from pathlib import Path

from collect_course_structure import gather_structure
from core.src.utils.file.yaml_utils import read_yaml_field_content, save_as_yaml
from jba.src.models.edu_structure import EduStructureType, EduLesson, EduStructureNode
from collect_course_structure import INFO_FILE_REGEX, _gather_structure # noqa: WPS450
from typing import List, Dict

CONTENT_META_FIELD = 'content'
FILES_META_FIELD = 'files'
VISIBLE_META_FIELD = 'visible'
TYPE_META_FIELD = 'type'
NAME_META_FIELD = 'name'
from jba.src.models.edu_structure import (
EduStructureType,
EduLesson,
EduStructureNode,
EduInfoFileField,
INFO_FILE_REGEX,
)

CONTENT_FILE_NAME = 'task_content_default.yaml'

FRAMEWORK_TYPE = 'framework'

TASK_DIRECTORY_NAME = 'task'

EXTENSIONS = {'py': 'PYTHON', 'ipynb': 'JUPYTER', 'java': 'JAVA', 'kt': 'KOTLIN', 'cpp': 'CPP', 'csv': 'CSV'}


Expand Down Expand Up @@ -90,15 +87,15 @@ def get_files(root: Path, lesson: EduLesson) -> List[TaskTrackerFile]:
def get_task_files(root: Path, relative_path: Path, is_framework: bool):
info_file = get_info_file(root)

files = read_yaml_field_content(root / info_file, FILES_META_FIELD)
files = read_yaml_field_content(root / info_file, EduInfoFileField.FILES.value)
if files is None:
files = []
files = list(filter(lambda file: file[VISIBLE_META_FIELD], files))
files = list(filter(lambda file: file[EduInfoFileField.VISIBLE.value], files))

def get_filename(file_content: Dict) -> TaskTrackerFile:
if is_framework:
return TaskTrackerFile(relative_path / TASK_DIRECTORY_NAME / file_content[NAME_META_FIELD])
return TaskTrackerFile(relative_path / root.name / file_content[NAME_META_FIELD])
return TaskTrackerFile(relative_path / TASK_DIRECTORY_NAME / file_content[EduInfoFileField.NAME.value])
return TaskTrackerFile(relative_path / root.name / file_content[EduInfoFileField.NAME.value])

return list(map(get_filename, files))

Expand All @@ -112,14 +109,14 @@ def course_structure_to_lessons(root: Path, structure: EduStructureNode):
path = root / structure.name
if structure.structure_type == EduStructureType.LESSON:
info_file = get_info_file(path)
content = read_yaml_field_content(path / info_file, CONTENT_META_FIELD)
yaml_file_content = read_yaml_field_content(path / info_file, TYPE_META_FIELD)
content = read_yaml_field_content(path / info_file, EduInfoFileField.CONTENT.value)
yaml_file_content = read_yaml_field_content(path / info_file, EduInfoFileField.TYPE.value)
return EduLesson(path, yaml_file_content is not None and yaml_file_content == FRAMEWORK_TYPE, content)
return flatten([course_structure_to_lessons(path, node) for node in structure.children])


def get_yaml_content(course_root: Path) -> Dict:
course_structure = _gather_structure(course_root)
course_structure = gather_structure(course_root)
lessons = course_structure_to_lessons(course_root, course_structure)

files = set()
Expand Down
40 changes: 21 additions & 19 deletions jba/tests/processing/test_collect_course_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,18 @@
from tempfile import NamedTemporaryFile

import pytest
from pandas._testing import assert_frame_equal

from core.src.utils.df_utils import read_df
from core.src.utils.subprocess_runner import run_in_subprocess
from jba.src import MAIN_FOLDER
from jba.src.models.edu_structure import EduStructureNode, EduStructureType
from jba.src.models.edu_structure import EduStructureNode, EduStructureType, EduInfoFileField
from jba.src.processing.collect_course_structure import (
_gather_structure,
ID_META_FIELD,
_convert_structure_to_dataframe,
get_course_structure,
gather_structure,
_convert_course_structure_to_dataframe_recursively,
convert_course_structure_to_dataframe,
)
from jba.tests.processing import COLLECT_COURSE_STRUCTURE_FOLDER
from pandas._testing import assert_frame_equal


COURSE_WITH_SECTION_STRUCTURE = EduStructureNode(
1,
Expand Down Expand Up @@ -77,7 +75,7 @@

@pytest.mark.parametrize(('course_root', 'expected_structure'), GATHER_STRUCTURE_TEST_DATA)
def test_gather_structure(course_root: Path, expected_structure: EduStructureNode):
actual_structure = _gather_structure(course_root)
actual_structure = gather_structure(course_root)
assert actual_structure == expected_structure


Expand All @@ -94,7 +92,10 @@ def test_gather_structure(course_root: Path, expected_structure: EduStructureNod
COLLECT_COURSE_STRUCTURE_FOLDER / 'course_with_undefined_structure_type',
r'Unable to determine a structure type for .+\.',
),
(COLLECT_COURSE_STRUCTURE_FOLDER / 'course_with_incorrect_id_field', rf'.+ must contain the {ID_META_FIELD} field\.'),
(
COLLECT_COURSE_STRUCTURE_FOLDER / 'course_with_incorrect_id_field',
rf'.+ must contain the {EduInfoFileField.ID.value} field\.',
),
(
COLLECT_COURSE_STRUCTURE_FOLDER / 'course_with_inconsistent_children',
r'All children nodes inside .+ must have the same structure type\.',
Expand All @@ -109,10 +110,10 @@ def test_gather_structure(course_root: Path, expected_structure: EduStructureNod
@pytest.mark.parametrize(('course_root', 'expected_message'), GATHER_STRUCTURE_THROWS_TEST_DATA)
def test_gather_structure_throws(course_root: Path, expected_message: str):
with pytest.raises(ValueError, match=expected_message):
_gather_structure(course_root)
gather_structure(course_root)


CONVERT_STRUCTURE_TO_DATAFRAME_TEST_DATA = [
CONVERT_STRUCTURE_TO_DATAFRAME_RECURSIVELY_TEST_DATA = [
(
COURSE_WITH_SECTION_STRUCTURE,
COLLECT_COURSE_STRUCTURE_FOLDER / 'expected_course_with_section_df_structure.csv',
Expand Down Expand Up @@ -193,12 +194,12 @@ def test_gather_structure_throws(course_root: Path, expected_message: str):
]


@pytest.mark.parametrize(('structure', 'expected_df_path'), CONVERT_STRUCTURE_TO_DATAFRAME_TEST_DATA)
def test_convert_structure_to_dataframe(structure: EduStructureNode, expected_df_path: Path):
assert_frame_equal(_convert_structure_to_dataframe(structure), read_df(expected_df_path))
@pytest.mark.parametrize(('structure', 'expected_df_path'), CONVERT_STRUCTURE_TO_DATAFRAME_RECURSIVELY_TEST_DATA)
def test_convert_structure_to_dataframe_recursively(structure: EduStructureNode, expected_df_path: Path):
assert_frame_equal(_convert_course_structure_to_dataframe_recursively(structure), read_df(expected_df_path))


GET_COURSE_STRUCTURE_TEST_DATA = [
CONVERT_COURSE_STRUCTURE_TO_DATAFRAME_TEST_DATA = [
(
COLLECT_COURSE_STRUCTURE_FOLDER / 'course_with_section',
COLLECT_COURSE_STRUCTURE_FOLDER / 'expected_course_with_section.csv',
Expand All @@ -210,9 +211,10 @@ def test_convert_structure_to_dataframe(structure: EduStructureNode, expected_df
]


@pytest.mark.parametrize(('course_root', 'expected_structure_path'), GET_COURSE_STRUCTURE_TEST_DATA)
def test_get_course_structure(course_root: Path, expected_structure_path: Path):
assert_frame_equal(get_course_structure(course_root), read_df(expected_structure_path))
@pytest.mark.parametrize(('course_root', 'expected_structure_path'), CONVERT_COURSE_STRUCTURE_TO_DATAFRAME_TEST_DATA)
def test_convert_course_structure_to_dataframe(course_root: Path, expected_structure_path: Path):
course_structure = gather_structure(course_root)
assert_frame_equal(convert_course_structure_to_dataframe(course_structure), read_df(expected_structure_path))


def test_incorrect_arguments():
Expand All @@ -224,7 +226,7 @@ def test_incorrect_arguments():
assert 'error: the following arguments are required' in stderr


@pytest.mark.parametrize(('course_root', 'expected_structure_path'), GET_COURSE_STRUCTURE_TEST_DATA)
@pytest.mark.parametrize(('course_root', 'expected_structure_path'), CONVERT_COURSE_STRUCTURE_TO_DATAFRAME_TEST_DATA)
def test_correct_arguments(course_root: Path, expected_structure_path: Path):
with NamedTemporaryFile(suffix='.csv') as output_file:
stdout, stderr = run_in_subprocess(
Expand Down

0 comments on commit cc52015

Please sign in to comment.