From dd9c18167495cc918448df2f6956296c81ff3478 Mon Sep 17 00:00:00 2001 From: Dmitry Petrov Date: Mon, 23 Jan 2023 23:36:54 +0000 Subject: [PATCH] Try a 40K dataset (4x data) --- data/data.xml.dvc | 9 +++++---- dvc.lock | 38 +++++++++++++++++++------------------- params.yaml | 2 +- 3 files changed, 25 insertions(+), 24 deletions(-) diff --git a/data/data.xml.dvc b/data/data.xml.dvc index 114d7c814..ca146c5a0 100644 --- a/data/data.xml.dvc +++ b/data/data.xml.dvc @@ -1,12 +1,13 @@ -md5: f66db4fac66a93d4feaa939b4506c3ab +md5: 25c4a84510a41557840c61692dd14c11 frozen: true deps: - path: get-started/data.xml repo: url: https://github.com/iterative/dataset-registry - rev_lock: 705bc71a0a13c47b9e5147a3524fafc41f8ac7fa + rev_lock: cf6481baf56f156aa0876709cc231aaf3f3a3c29 + rev: get-started-40K outs: -- md5: 22a1a2931c8370d3aeedd7183606fd7f - size: 14445097 +- md5: 4bd325a30d5f1d5ea1a451d98767ddde + size: 59918667 hash: md5 path: data.xml diff --git a/dvc.lock b/dvc.lock index 447fcd573..ee3b44ae3 100644 --- a/dvc.lock +++ b/dvc.lock @@ -5,8 +5,8 @@ stages: deps: - path: data/data.xml hash: md5 - md5: 22a1a2931c8370d3aeedd7183606fd7f - size: 14445097 + md5: 4bd325a30d5f1d5ea1a451d98767ddde + size: 59918667 - path: src/prepare.py hash: md5 md5: f54d670ac8a4f63206781fc31d1f2651 @@ -18,16 +18,16 @@ stages: outs: - path: data/prepared hash: md5 - md5: 153aad06d376b6595932470e459ef42a.dir - size: 8437363 + md5: f8934609be51496ee500f80eea539c6f.dir + size: 35339221 nfiles: 2 featurize: cmd: python src/featurization.py data/prepared data/features deps: - path: data/prepared hash: md5 - md5: 153aad06d376b6595932470e459ef42a.dir - size: 8437363 + md5: f8934609be51496ee500f80eea539c6f.dir + size: 35339221 nfiles: 2 - path: src/featurization.py hash: md5 @@ -35,21 +35,21 @@ stages: size: 4158 params: params.yaml: - featurize.max_features: 200 + featurize.max_features: 500 featurize.ngrams: 2 outs: - path: data/features hash: md5 - md5: 4281fdd8e973e3bbe5abc0ae10adebc7.dir - size: 2232588 + md5: c9308f114f6a8f06fb5ba2b40ea81678.dir + size: 12597137 nfiles: 2 train: cmd: python src/train.py data/features model.pkl deps: - path: data/features hash: md5 - md5: 4281fdd8e973e3bbe5abc0ae10adebc7.dir - size: 2232588 + md5: c9308f114f6a8f06fb5ba2b40ea81678.dir + size: 12597137 nfiles: 2 - path: src/train.py hash: md5 @@ -63,20 +63,20 @@ stages: outs: - path: model.pkl hash: md5 - md5: 46f38c08e3d5174e5e3fb8753994d38b - size: 1957931 + md5: b568c889ca6a5719632188daa0bfd513 + size: 3365545 evaluate: cmd: python src/evaluate.py model.pkl data/features deps: - path: data/features hash: md5 - md5: 4281fdd8e973e3bbe5abc0ae10adebc7.dir - size: 2232588 + md5: c9308f114f6a8f06fb5ba2b40ea81678.dir + size: 12597137 nfiles: 2 - path: model.pkl hash: md5 - md5: 46f38c08e3d5174e5e3fb8753994d38b - size: 1957931 + md5: b568c889ca6a5719632188daa0bfd513 + size: 3365545 - path: src/evaluate.py hash: md5 md5: a1a59f55636170fb56e0c6afd3e28fa4 @@ -84,6 +84,6 @@ stages: outs: - path: eval hash: md5 - md5: 0deddc7fb86151e1cb4684e93be58f70.dir - size: 1292365 + md5: 6fe98138454e84433ffaf097fc5cfd51.dir + size: 4964239 nfiles: 8 diff --git a/params.yaml b/params.yaml index 954cc22b5..b16b7b66f 100644 --- a/params.yaml +++ b/params.yaml @@ -3,7 +3,7 @@ prepare: seed: 20170428 featurize: - max_features: 200 + max_features: 500 ngrams: 2 train: