Dev/rebuild sandbox (#332)
* sandbox rebuild v1

* switch

* fix hpo 3sigma

* after pre-commit

* sandbox readme zh

* finish doc

* other_configs -> extra_configs

* other_configs -> extra_configs

* res_name -> meta_name

* hooker -> hook

* analyze -> analyse

* after pre-commit

* analyse -> analyze

* analyser.py -> analyzer.py

* analyser.py -> analyzer.py

* analyser.py -> analyzer.py

* regist -> register, DICT -> MAPPING
BeachWang authored Jul 17, 2024
1 parent 9c7f316 commit 0fdb97a
Showing 36 changed files with 797 additions and 489 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -276,17 +276,17 @@ python tools/process_data.py --config ./demos/process_video_on_ray/configs/demo.
 ### Data Analysis
-- Run `analyze_data.py` tool or `dj-analyze` command line tool with your config as the argument to analyse your dataset.
+- Run `analyze_data.py` tool or `dj-analyze` command line tool with your config as the argument to analyze your dataset.
 ```shell
 # only for installation from source
-python tools/analyze_data.py --config configs/demo/analyser.yaml
+python tools/analyze_data.py --config configs/demo/analyzer.yaml
 # use command line tool
-dj-analyze --config configs/demo/analyser.yaml
+dj-analyze --config configs/demo/analyzer.yaml
 ```
-- **Note:** Analyser only compute stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process.
+- **Note:** Analyzer only compute stats of Filter ops. So extra Mapper or Deduplicator ops will be ignored in the analysis process.
 ### Data Visualization
6 changes: 3 additions & 3 deletions README_ZH.md
@@ -262,13 +262,13 @@ python tools/process_data.py --config ./demos/process_video_on_ray/configs/demo.

 ```shell
 # 适用于从源码安装
-python tools/analyze_data.py --config configs/demo/analyser.yaml
+python tools/analyze_data.py --config configs/demo/analyzer.yaml
 # 使用命令行工具
-dj-analyze --config configs/demo/analyser.yaml
+dj-analyze --config configs/demo/analyzer.yaml
 ```

-* **注意**:Analyser 只计算 Filter 算子的状态,其他的算子(例如 Mapper 和 Deduplicator)会在分析过程中被忽略。
+* **注意**:Analyzer 只计算 Filter 算子的状态,其他的算子(例如 Mapper 和 Deduplicator)会在分析过程中被忽略。

 ### 数据可视化
8 changes: 4 additions & 4 deletions app.py
@@ -18,7 +18,7 @@
 from data_juicer.analysis.diversity_analysis import (DiversityAnalysis,
                                                      get_diversity)
 from data_juicer.config import init_configs
-from data_juicer.core import Analyser, Executor
+from data_juicer.core import Analyzer, Executor
 from data_juicer.ops.base_op import OPERATORS
 from data_juicer.utils.constant import Fields, StatsKeys
 from data_juicer.utils.logger_utils import get_log_file_path
@@ -134,7 +134,7 @@ def analyze_and_show_res():
     cfg['save_stats_in_one_file'] = True

     logger.info('=========Stage 1: analyze original data=========')
-    analyzer = Analyser(cfg)
+    analyzer = Analyzer(cfg)
     dataset = analyzer.run()

     overall_file = os.path.join(analyzer.analysis_path, 'overall.csv')
@@ -171,7 +171,7 @@ def process_and_show_res():

     cfg_for_processed_data.export_path = os.path.dirname(
         cfg.export_path) + '_processed/data.jsonl'
-    analyzer = Analyser(cfg_for_processed_data)
+    analyzer = Analyzer(cfg_for_processed_data)
     analyzer.analysis_path = os.path.dirname(
         cfg_for_processed_data.export_path) + '/analysis'
     analyzer.run()
@@ -460,7 +460,7 @@ def diversity():
                                          max_value=100,
                                          step=1)

-        diversity_btn = st.button('Analyse_diversity',
+        diversity_btn = st.button('Analyze_diversity',
                                   use_container_width=True)
         output_path = os.path.join(os.path.dirname(cfg.export_path),
                                    'analysis')
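For reference, the renamed class keeps the entry points app.py relies on (`run()` and `analysis_path`). Below is a minimal sketch of driving it programmatically; passing an explicit argument list to `init_configs` is an assumption, since app.py only calls it on the real CLI arguments.

```python
# Minimal sketch, mirroring the analyze_and_show_res() flow above.
# Assumption: init_configs accepts a CLI-style argument list.
from data_juicer.config import init_configs
from data_juicer.core import Analyzer

cfg = init_configs(['--config', 'configs/demo/analyzer.yaml'])
cfg['save_stats_in_one_file'] = True  # same toggle as in app.py

analyzer = Analyzer(cfg)
dataset = analyzer.run()
# stats summaries are exported under analyzer.analysis_path, e.g. overall.csv
```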
2 changes: 1 addition & 1 deletion configs/README.md
@@ -7,7 +7,7 @@ This folder contains some configuration files to allow users to easily understan
 ```shell
 # To process your dataset.
 python tools/process_data.py --config xxx.yaml
-# To analyse your dataset.
+# To analyze your dataset.
 python tools/analyze_data.py --config xxx.yaml
 ```
8 changes: 1 addition & 7 deletions configs/config_all.yaml
@@ -40,19 +40,13 @@ executor_type: default # type of executor,
 ray_address: auto # the address of the Ray cluster.

 # only for data analysis
-percentiles: [0.25, 0.5, 0.75] # percentiles to analyse the dataset distribution
+percentiles: [0.25, 0.5, 0.75] # percentiles to analyze the dataset distribution
 export_original_dataset: false # whether to export the original dataset with stats. If you only need the stats of the dataset, setting it to false could speed up the exporting.
 save_stats_in_one_file: false # whether to store all stats result into one file

 # for sandbox or hpo
-model_infer_config: null # path or dict to model inference configuration file when calling model executor in sandbox. Related hooks will be disabled if it's not specified.
-model_train_config: null # path or dict to model training configuration file when calling model executor in sandbox. Related hooks will be disabled if it's not specified.
-model_eval_config: null # path or dict to model evaluation configuration file when calling model executor in sandbox. Related hooks will be disabled if it's not specified.
-data_eval_config: null # path or dict to data evaluation configuration file when calling model executor in sandbox. Related hooks will be disabled if it's not specified.
 data_probe_algo: 'uniform' # sampling algorithm for dataset. Should be one of ["uniform", "frequency_specified_field_selector", "topk_specified_field_selector"]. It's "uniform" in default. Only used for dataset sampling.
 data_probe_ratio: 1.0 # the sampling ratio to the original dataset size. It's 1.0 in default. Only used for dataset sampling.
-path_k_sigma_recipe: null # path to save a configuration file when using k-sigma tool to refine processing recipes
-path_model_feedback_recipe: null # path to save a configuration file refined by model feedback
 hpo_config: null # path to a configuration file when using auto-HPO tool.
4 changes: 2 additions & 2 deletions configs/demo/analyser.yaml
@@ -1,11 +1,11 @@
 # Process config example for dataset

 # global parameters
-project_name: 'demo-analyser'
+project_name: 'demo-analyzer'
 dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
 np: 4 # number of subprocess to process your dataset

-export_path: './outputs/demo-analyser/demo-analyser-result.jsonl'
+export_path: './outputs/demo-analyzer/demo-analyzer-result.jsonl'

 # process schedule
 # a list of several process operators with their arguments
1 change: 1 addition & 0 deletions configs/demo/process.yaml
@@ -12,3 +12,4 @@ export_path: './outputs/demo-process/demo-processed.jsonl'
 process:
   - language_id_score_filter:
       lang: 'zh'
+      min_score: 0.8
1 change: 1 addition & 0 deletions configs/demo/sandbox/gpt3_data_quality_eval_config.yaml
@@ -1 +1,2 @@
 type: dj_text_quality_classifier
+dataset_path: './outputs/demo-process/demo-processed.jsonl'
2 changes: 2 additions & 0 deletions configs/demo/sandbox/gpt3_extra_train_config.json
@@ -1,5 +1,7 @@
 {
   "type": "modelscope",
+  "dataset_path": "./outputs/demo-process/demo-processed.jsonl",
+  "work_dir": "./demos/data/",
   "model_name": "iic/nlp_gpt3_text-generation_chinese-base",
   "trainer_name": "nlp-base-trainer",
   "key_remapping": {
2 changes: 2 additions & 0 deletions configs/demo/sandbox/gpt3_extra_train_config.yaml
@@ -1,4 +1,6 @@
 type: modelscope
+dataset_path: './outputs/demo-process/demo-processed.jsonl'
+work_dir: './demos/data/'
 model_name: "iic/nlp_gpt3_text-generation_chinese-base"
 trainer_name: "nlp-base-trainer"
 key_remapping:
81 changes: 61 additions & 20 deletions configs/demo/sandbox/sandbox.yaml
@@ -1,27 +1,68 @@
-# Sandbox config example for dataset
+# Sandbox config example

 # global parameters
 project_name: 'demo-sandbox'
-dataset_path: './demos/data/demo-dataset.jsonl' # path to your dataset directory or file
-np: 4 # number of subprocess to process your dataset
+experiment_name: 'demo-sandbox-run0' # for wandb tracer name
+hpo_config: null # path to a configuration file when using auto-HPO tool.

-export_path: './outputs/demo-sandbox/demo-sandbox.jsonl'
+# configs for each job, the jobs will be executed according to the order in the list
+probe_job_configs:
+  - hook: 'ProbeViaAnalyzerHook'
+    meta_name: 'analysis_ori_data'
+    dj_configs: 'configs/demo/process.yaml'
+    extra_configs:
+#  - hook: 'ProbeViaModelInferHook'
+#    meta_name: 'analysis_ori_model'
+#    dj_configs:
+#      dataset_path: './demos/data/demo-dataset.jsonl'
+#      export_path: './outputs/demo-sandbox/demo-sandbox.jsonl'
+#      data_probe_algo: 'uniform'
+#      data_probe_ratio: 0.5
+#    extra_configs:
+#      (...model configs)

-# sandbox configs
-# for refining recipe using k-sigma rules
-path_k_sigma_recipe: './outputs/demo-sandbox/k_sigma_new_recipe.yaml'
+refine_recipe_job_configs:
+  - hook: 'RefineRecipeViaKSigmaHook'
+    meta_name: 'analysis_ori_data'
+    dj_configs: 'configs/demo/process.yaml'
+    extra_configs:
+      path_k_sigma_recipe: './outputs/demo-process/k_sigma_new_recipe.yaml'
+#  - hook: 'RefineRecipeViaModelFeedbackHook'
+#    meta_name:
+#    dj_configs:
+#    extra_configs:
+#      (...model configs)

-# for gpt3 quality classifier as data evaluator
-data_eval_config: 'configs/demo/sandbox/gpt3_data_quality_eval_config.yaml'
-#data_eval_config:
-#   type: dj_text_quality_classifier
+execution_job_configs:
+  - hook: 'ProcessDataHook'
+    meta_name:
+    dj_configs: './outputs/demo-process/k_sigma_new_recipe.yaml'
+    extra_configs:
+  - hook: 'TrainModelHook'
+    meta_name:
+    dj_configs:
+    extra_configs: 'configs/demo/sandbox/gpt3_extra_train_config.json'

-# for gpt3 model training
-model_train_config: 'configs/demo/sandbox/gpt3_extra_train_config.json'
-
-# process schedule
-# a list of several process operators with their arguments
-process:
-  - language_id_score_filter:
-      lang: 'zh'
-      min_score: 0.5
+evaluation_job_configs:
+  - hook: 'ProbeViaAnalyzerHook'
+    meta_name: 'analysis_processed_data'
+    dj_configs: 'configs/demo/process.yaml'
+    extra_configs:
+#  - hook: 'ProbeViaModelInferHook'
+#    meta_name: 'analysis_trained_model'
+#    dj_configs:
+#      dataset_path: './demos/data/demo-dataset.jsonl'
+#      export_path: './outputs/demo-sandbox/demo-sandbox.jsonl'
+#      data_probe_algo: 'uniform'
+#      data_probe_ratio: 0.5
+#    extra_configs:
+#      (...model configs)
+  - hook: 'EvaluateDataHook'
+    meta_name: 'eval_data'
+    dj_configs:
+    extra_configs: 'configs/demo/sandbox/gpt3_data_quality_eval_config.yaml'
+#  - hook: 'EvaluateModelHook'
+#    meta_name: 'eval_model'
+#    dj_configs:
+#    extra_configs:
+#      (...model configs)
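The new layout replaces the flat `model_train_config`/`data_eval_config` keys with four ordered job lists, each entry dispatched by its `hook` name. A rough sketch of that dispatch loop follows; `HOOK_MAPPING`, `register_hook`, and the hook body are illustrative assumptions based on the config shape and the `regist -> register, DICT -> MAPPING` rename in the commit message, not the actual sandbox implementation.

```python
# Illustrative sketch of the job dispatch implied by sandbox.yaml.
# HOOK_MAPPING / register_hook are assumptions, not the real data-juicer API.
HOOK_MAPPING = {}

def register_hook(name):
    def decorator(cls):
        HOOK_MAPPING[name] = cls
        return cls
    return decorator

@register_hook('ProbeViaAnalyzerHook')
class ProbeViaAnalyzerHook:
    def __init__(self, job_cfg):
        self.job_cfg = job_cfg

    def run(self, context):
        # e.g. analyze the dataset in dj_configs and stash the result under
        # meta_name, so later jobs (like RefineRecipeViaKSigmaHook, which
        # reuses 'analysis_ori_data' above) can look it up.
        context[self.job_cfg['meta_name']] = f"analysis of {self.job_cfg['dj_configs']}"
        return context

def run_sandbox(cfg):
    """Run the job lists in their fixed order, and each list in order."""
    context = {}
    for section in ('probe_job_configs', 'refine_recipe_job_configs',
                    'execution_job_configs', 'evaluation_job_configs'):
        for job_cfg in cfg.get(section, []):
            hook_cls = HOOK_MAPPING[job_cfg['hook']]
            context = hook_cls(job_cfg).run(context)
    return context

# toy run with just the first probe job from the YAML above
print(run_sandbox({'probe_job_configs': [{
    'hook': 'ProbeViaAnalyzerHook',
    'meta_name': 'analysis_ori_data',
    'dj_configs': 'configs/demo/process.yaml',
    'extra_configs': None,
}]}))
```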
8 changes: 4 additions & 4 deletions data_juicer/analysis/column_wise_analysis.py
@@ -62,7 +62,7 @@ def __init__(self,
         """
         Initialization method
-        :param dataset: the dataset to be analysed
+        :param dataset: the dataset to be analyzed
         :param output_path: path to store the analysis results
         :param overall_result: optional precomputed overall stats result
         :param save_stats_in_one_file: whether save all analysis figures of all
@@ -73,15 +73,15 @@ def __init__(self,
         if not os.path.exists(self.output_path):
             os.makedirs(self.output_path)

-        # if no overall description provided, analyse it from scratch
+        # if no overall description provided, analyze it from scratch
         if overall_result is None:
             oa = OverallAnalysis(dataset, output_path)
-            overall_result = oa.analyse()
+            overall_result = oa.analyze()
         self.overall_result = overall_result

         self.save_stats_in_one_file = save_stats_in_one_file

-    def analyse(self, show_percentiles=False, show=False, skip_export=False):
+    def analyze(self, show_percentiles=False, show=False, skip_export=False):
         """
         Apply analysis and draw the analysis figure for stats.
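As the second hunk shows, `analyze()` falls back to computing an `OverallAnalysis` when no `overall_result` is passed in. A hedged usage sketch on a toy dataset that already carries stats (normally produced by an `Analyzer` run); the stat names here are made up for illustration.

```python
# Hedged sketch: toy dataset with precomputed stats; stat names are made up.
from datasets import Dataset
from data_juicer.analysis.column_wise_analysis import ColumnWiseAnalysis
from data_juicer.utils.constant import Fields

ds = Dataset.from_list([
    {'text': 'hello', Fields.stats: {'lang_score': 0.9, 'num_words': 1}},
    {'text': 'hello world', Fields.stats: {'lang_score': 0.8, 'num_words': 2}},
])
# No overall_result passed, so OverallAnalysis.analyze() runs internally.
cwa = ColumnWiseAnalysis(ds, './outputs/analysis')
cwa.analyze(show_percentiles=True)  # draws one figure per stats column
```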
16 changes: 8 additions & 8 deletions data_juicer/analysis/diversity_analysis.py
@@ -39,9 +39,9 @@ def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True):
     Find the verb and its object closest to the root of lexical tree of input
     string.
-    :param nlp: the diversity model to analyse the diversity strings
-    :param s: the string to be analysed
-    :param first_sent: whether to analyse the first sentence in the
+    :param nlp: the diversity model to analyze the diversity strings
+    :param s: the string to be analyzed
+    :param first_sent: whether to analyze the first sentence in the
         input string only. If it's true, return the analysis result of
         the first sentence no matter it's valid or not. If it's false,
         return the first valid result over all sentences
@@ -87,7 +87,7 @@ class DiversityAnalysis:
     result."""

     def __init__(self, dataset, output_path, lang_or_model='en'):
-        """Initialization method :param dataset: the dataset to be analysed
+        """Initialization method :param dataset: the dataset to be analyzed
         :param output_path: path to store the analysis results :param
         lang_or_model: the diversity model or a specific language used to load
         the diversity model."""
@@ -104,7 +104,7 @@ def compute(self, lang_or_model=None, column_name='text'):
         :param lang_or_model: the diversity model or a specific language
             used to load the diversity model
-        :param column_name: the name of column to be analysed
+        :param column_name: the name of column to be analyzed
         :return: the analysis result.
         """
         # load diversity model
@@ -129,7 +129,7 @@ def find_verb_noun(sample):
         dataset = self.dataset.map(find_verb_noun)
         return pd.DataFrame(dataset)

-    def analyse(self,
+    def analyze(self,
                 lang_or_model=None,
                 column_name='text',
                 postproc_func=get_diversity,
@@ -139,8 +139,8 @@ def analyse(self,
         :param lang_or_model: the diversity model or a specific language
             used to load the diversity model
-        :param column_name: the name of column to be analysed
-        :param postproc_func: function to analyse diversity. In default,
+        :param column_name: the name of column to be analyzed
+        :param postproc_func: function to analyze diversity. In default,
             it's function get_diversity
         :param postproc_kwarg: arguments of the postproc_func
         :return:
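Apart from the spelling, the signature is unchanged. A small usage sketch follows; it assumes the spaCy-based diversity model for `'en'` can be fetched in your environment.

```python
# Hedged sketch of the renamed DiversityAnalysis.analyze().
# Assumption: the 'en' diversity model can be downloaded/loaded locally.
from datasets import Dataset
from data_juicer.analysis.diversity_analysis import DiversityAnalysis

ds = Dataset.from_list([{'text': 'Write a story about a robot.'},
                        {'text': 'Explain how rainbows form.'}])
da = DiversityAnalysis(ds, './outputs/analysis', lang_or_model='en')
df = da.analyze(column_name='text')  # verb-noun diversity as a DataFrame
```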
14 changes: 7 additions & 7 deletions data_juicer/analysis/overall_analysis.py
@@ -21,17 +21,17 @@ def __init__(self, dataset, output_path):
         """
         Initialization method.
-        :param dataset: the dataset to be analysed
+        :param dataset: the dataset to be analyzed
         :param output_path: path to store the analysis results.
         """
         self.stats = pd.DataFrame(dataset[Fields.stats])
         self.output_path = output_path
         if not os.path.exists(self.output_path):
             os.makedirs(self.output_path)

-        # default percentiles to analyse
+        # default percentiles to analyze
         self.default_percentiles = [0.25, 0.5, 0.75]
-        # supported dtypes of column to be analysed
+        # supported dtypes of column to be analyzed
         # Notice: there won't be mixed types in a column because the stats is
         # obtained from Dataset, which doesn't allow mixed types.
         # Notice: for now, stats can only be:
@@ -48,7 +48,7 @@ def refine_single_column(self, col):
         if type(first) not in self.supported_object_types:
             logger.warning(f'There is a column of stats with type '
                            f'[{type(first)}], which is not supported to be '
-                           f'analysed for now.')
+                           f'analyzed for now.')
             return None
         if type(first) is str:
             # describe(include = 'all') can analyze the string type
@@ -58,13 +58,13 @@ def refine_single_column(self, col):
             col = col.explode().infer_objects()
         return col

-    def analyse(self, percentiles=[], num_proc=1, skip_export=False):
+    def analyze(self, percentiles=[], num_proc=1, skip_export=False):
         """
         Apply overall analysis on the whole dataset based on the describe
         method of pandas.
-        :param percentiles: percentiles to analyse
-        :param num_proc: number of processes to analyse the dataset
+        :param percentiles: percentiles to analyze
+        :param num_proc: number of processes to analyze the dataset
         :param skip_export: whether export the results to disk
         :return: the overall analysis result.
         """
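Matching the signature shown above (`analyze(percentiles=[], num_proc=1, skip_export=False)`), a minimal sketch; the stats layout mirrors the `Fields.stats` column the constructor reads, with made-up stat names.

```python
# Hedged sketch of OverallAnalysis.analyze() (formerly analyse()).
from datasets import Dataset
from data_juicer.analysis.overall_analysis import OverallAnalysis
from data_juicer.utils.constant import Fields

ds = Dataset.from_list([
    {Fields.stats: {'lang_score': 0.9, 'num_words': 5}},
    {Fields.stats: {'lang_score': 0.7, 'num_words': 11}},
])
oa = OverallAnalysis(ds, './outputs/analysis')
summary = oa.analyze(percentiles=[0.25, 0.5, 0.75])
print(summary)  # pandas describe()-style table per stats column
```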
3 changes: 2 additions & 1 deletion data_juicer/config/__init__.py
@@ -1,4 +1,5 @@
-from .config import export_config, init_configs, merge_config
+from .config import (export_config, get_init_configs, init_configs,
+                     merge_config, prepare_side_configs)

 __all__ = [
     'init_configs',