diff --git a/.gitignore b/.gitignore
index 18021663bc940..2451185f3ee6b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ test_*/
*.local
go.sum
*.pyc
+*.ezdraw
diff --git a/README.md b/README.md
index 124cb5d4c0c22..886e67c198911 100644
--- a/README.md
+++ b/README.md
@@ -1,66 +1,9 @@
# TiDB Lightning
-TiDB Lightning is a data import tool which is used to fast import a large amount of data to the TiDB cluster. Currently, it only supports source data in the Mydumper file format and in the future it will support more formats like CSV.
+**TiDB Lightning** is a tool for fast full import of large amounts of data into a TiDB cluster.
+Currently, it supports reading SQL dumps exported by mydumper.
-Now TiDB Lightning only supports full import of new tables. During the importing process, the cluster cannot provide services normally; as a result, TiDB Lightning is not suitable for importing data online.
+![](docs/en_US/tidb-lightning.svg)
-## TiDB Lightning architecture
-
-The following diagram shows the architecture of TiDB Lightning:
-
-![](media/tidb-lightning-architecture.png)
-
-One set of TiDB Lightning has two components:
-
-- `tidb-lightning`
-
- The front-end part of TiDB Lightning. It transforms the source data into Key-Value (KV) pairs and writes the data into `tikv-importer`.
-
-- `tikv-importer`
-
- The back-end part of TiDB Lightning. It caches, sorts, and splits the KV pairs written by `tidb-lightning` and imports the KV pairs to the TiKV cluster.
-
-## TiDB Lightning workflow
-
-1. Before importing data, `tidb-lightning` automatically switches the TiKV mode to the import mode via API.
-2. `tidb-lightning` obtains data from the data source, transforms the source data into KV data, and then writes the data into `tikv-importer`.
-3. When the data written by `tidb-lightning` reaches a specific size, `tidb-lightning` sends the `Import` command to `tikv-importer`.
-4. `tikv-importer` divides and schedules the TiKV data of the target cluster and then imports the data to the TiKV cluster.
-5. `tidb-lightning` transforms and imports the source data continuously until it finishes importing the data in the source data directory.
-6. `tidb-lightning` performs the `Compact`, `Checksum`, and `Analyze` operation on tables in the target cluster.
-7. `tidb-lightning` automatically switches the TiKV mode to the normal mode. Then the TiDB cluster can provide services normally.
-
-## Deploy process
-
-### Notes
-
-Before deploying TiDB Lightning, you should take note that:
-
-- When TiDB Lightning is running, the TiDB cluster cannot provide services normally.
-- When you import data using TiDB Lightning, you cannot check some source data constraints such as the primary key conflict and unique index conflict. If needed, you can check using `ADMIN CHECK TABLE` via the MySQL client after importing, but it may take a long time.
-- Currently, TiDB Lightning does not support breakpoint. If any error occurs during importing, delete the data from the target cluster using `DROP TABLE` and import the data again.
-- If TiDB Lightning exits abnormally, you need to use the `-swtich-mode` command line parameter of `tidb-lightning` to manually close the import mode of the TiKV cluster and change it to the normal mode:
-
- ```
- ./bin/tidb-lightning -switch-mode normal
- ```
-
-### Hardware requirements
-
-See [Hardware requirements of TiDB Lightning](docs/tidb-lightning-user-guide.md#hardware-requirements)
-
-### Prepare
-
-Before importing, you should:
-
-- Deploy a set of TiDB cluster (TiDB version is 2.0.4 or later) which is the target cluster for importing (the target cluster).
-- Prepare the binary file and the configuration file of `tikv-importer`. It is recommended to use standalone deployment.
-- Prepare the binary file and the configuration file of `tidb-lightning`. It is recommended to use standalone deployment.
-
-Download the installation packages of `tikv-importer` and `tidb-lightning` via:
-
-https://download.pingcap.org/tidb-lightning-latest-linux-amd64.tar.gz
-
-### Deploy
-
-See [TiDB Lightning User Guide](docs/tidb-lightning-user-guide.md#deploy)
+* [Detailed documentation](docs/en_US/README.md)
+* [简体中文文档](docs/zh_CN/README.md)
diff --git a/docs/en_US/01-Architecture.md b/docs/en_US/01-Architecture.md
new file mode 100644
index 0000000000000..0e3aea402cf60
--- /dev/null
+++ b/docs/en_US/01-Architecture.md
@@ -0,0 +1,37 @@
+Architecture
+============
+
+![Architecture of TiDB Lightning tool set](./tidb-lightning.svg)
+
+The TiDB Lightning tool set consists of two components:
+
+- **`tidb-lightning`** (the "front end") reads the SQL dump and imports the database structure
+ into the TiDB cluster, and also transforms the data into Key-Value (KV) pairs
+ and sends them to `tikv-importer`.
+
+- **`tikv-importer`** (the "back end") combines and sorts the KV pairs and then
+ imports these sorted pairs as a whole into the TiKV cluster.
+
+The complete import process is like this:
+
+1. Before importing, `tidb-lightning` switches the TiKV cluster to "import mode", which optimizes
+ the cluster for writing and disables automatic compaction.
+
+2. `tidb-lightning` creates the skeleton of all tables from the data source.
+
+3. For each table, `tidb-lightning` informs `tikv-importer` via gRPC to create an *engine file*
+ to store KV pairs. `tidb-lightning` then reads the SQL dump in parallel, transforms the data
+ into KV pairs according to the TiDB rules, and sends them to `tikv-importer`'s engine files.
+
+4. Once a full table of KV pairs is received, `tikv-importer` divides and schedules this data
+ and imports it into the target TiKV cluster.
+
+5. `tidb-lightning` then compares the checksum calculated from the local data source against
+ the one calculated from the cluster, to ensure there is no data corruption in the process.
+
+6. After all tables are imported, `tidb-lightning` performs a global compaction on the TiKV
+ cluster, and tells TiDB to `ANALYZE` all imported tables, to prepare for optimal query planning.
+
+7. Finally, `tidb-lightning` switches the TiKV cluster back to "normal mode" so the cluster
+ resumes normal services.
+
diff --git a/docs/en_US/02-Deployment.md b/docs/en_US/02-Deployment.md
new file mode 100644
index 0000000000000..f8e9565c2b40f
--- /dev/null
+++ b/docs/en_US/02-Deployment.md
@@ -0,0 +1,339 @@
+Deployment and Execution
+========================
+
+Hardware requirements
+---------------------
+
+### Notes
+
+Before starting TiDB Lightning, note that:
+
+- During the import process, the cluster cannot provide normal services.
+- If `tidb-lightning` crashes, the cluster will be left in "import mode".
+ Forgetting to switch back to "normal mode" will lead to a high amount of uncompacted data on
+ the TiKV cluster, and will cause abnormally high CPU usage and stall.
+ You can manually switch the cluster back to "normal mode" via the `tidb-lightning-ctl` tool:
+
+ ```sh
+ bin/tidb-lightning-ctl -switch-mode=normal
+ ```
+
+### Deploying to separate machines
+
+`tidb-lightning` and `tikv-importer` are resource-intensive programs. It is recommended to deploy them
+on two separate, dedicated machines.
+
+To achieve the best performance, it is recommended to use the following hardware configuration:
+
+- `tidb-lightning`
+
+ - 32+ logical cores CPU
+ - 16 GB+ memory
+ - 1 TB+ SSD, preferring higher read speed
+ - 10 Gigabit network card
+ - `tidb-lightning` fully consumes all CPU cores when running,
+ and deploying on a dedicated machine is highly recommended.
+ If that is not possible, `tidb-lightning` can be deployed together with other components such as
+ `tidb-server`, with its CPU usage limited via the `region-concurrency` setting.
+
+- `tikv-importer`
+
+ - 32+ logical cores CPU
+ - 32 GB+ memory
+ - 1 TB+ SSD, preferring higher IOPS
+ - 10 Gigabit network card
+ - `tikv-importer` fully consumes all CPU, disk I/O and network bandwidth when running,
+ and deploying on a dedicated machine is strongly recommended.
+ If that is not possible, `tikv-importer` can be deployed together with other components such as
+ `tikv-server`, but the import speed might be affected.
+
+If you have enough machines, you can deploy multiple Lightning/Importer servers,
+each working on a distinct set of tables, to import the data in parallel.
+
+### Deploying to single machine
+
+If hardware resources are severely constrained, it is possible to deploy `tidb-lightning`,
+`tikv-importer` and other components on the same machine, but note that the import performance
+will be affected.
+
+We recommend the following configuration for the single machine:
+
+- 32+ logical cores CPU
+- 32 GB+ memory
+- 1 TB+ SSD, preferring higher IOPS
+- 10 Gigabit network card
+
+`tidb-lightning` is a CPU-intensive program. In an environment with mixed components, the resources
+allocated to `tidb-lightning` must be limited. Otherwise, other components might not be able to run.
+We recommend setting `region-concurrency` to 75% of the number of logical CPU cores. For instance,
+if the CPU has 32 logical cores, `region-concurrency` can be set to 24.
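+
+For example, on that 32-core machine the limit could be expressed in `tidb-lightning.toml` as
+follows (a minimal sketch; adjust the value to 75% of your own core count):
+
+```toml
+[lightning]
+# Limit tidb-lightning to 24 of the 32 logical cores.
+region-concurrency = 24
+```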
+
+Ansible deployment
+------------------
+
+TiDB Lightning can be deployed using Ansible, like [TiDB cluster itself][tidb-ansible].
+
+[tidb-ansible]: https://github.com/pingcap/docs/blob/master/op-guide/ansible-deployment.md
+
+1. Edit `inventory.ini` to provide the addresses of the `tidb-lightning` and `tikv-importer`
+ servers:
+
+ ```ini
+ ...
+
+ [importer_server]
+ 192.168.20.9
+
+ [lightning_server]
+ 192.168.20.10
+
+ ...
+ ```
+
+2. Configure these tools by editing the settings under `group_vars/*.yml`.
+
+ * `group_vars/all.yml`
+
+ ```yaml
+ ...
+ # The listening port of tikv-importer. Should be open to the tidb-lightning server.
+ tikv_importer_port: 20170
+ ...
+ ```
+
+ * `group_vars/lightning_server.yml`
+
+ ```yaml
+ ---
+ dummy:
+
+ # The listening port for metrics gathering. Should be open to the monitoring servers.
+ tidb_lightning_pprof_port: 10089
+
+ # The file path tidb-lightning reads the mydumper SQL dump from.
+ data_source_dir: "{{ deploy_dir }}/mydumper"
+ ```
+
+ * `group_vars/importer_server.yml`
+
+ ```yaml
+ ---
+ dummy:
+
+ # The file path to store engine files. Should reside on a partition with large capacity.
+ import_dir: "{{ deploy_dir }}/data.import"
+ ```
+
+3. Deploy the cluster via the usual steps:
+
+ ```sh
+ ansible-playbook bootstrap.yml
+ ansible-playbook deploy.yml
+ ```
+
+4. Mount the data source to the path specified in the `data_source_dir` setting.
+
+5. Log in to the `tikv-importer` server, and manually run
+
+ ```sh
+ scripts/start_importer.sh
+ ```
+
+ to start Importer.
+
+6. Log in to the `tidb-lightning` server, and manually run the following command
+ to start Lightning and import the data into the TiDB cluster.
+
+ ```sh
+ scripts/start_lightning.sh
+ ```
+
+7. After completion, run `scripts/stop_importer.sh` on the `tikv-importer` server to stop Importer.
+
+Manual deployment
+-----------------
+
+### TiDB cluster
+
+Before importing, you should have deployed a TiDB cluster of version 2.0.4 or above.
+Using the latest version is highly recommended.
+
+You can find deployment instructions in the
+[TiDB Quick Start Guide](https://pingcap.com/docs/QUICKSTART/).
+
+Download the TiDB Lightning tool set (choose the version matching the cluster):
+
+- **v2.1**: https://download.pingcap.org/tidb-lightning-release-2.1-linux-amd64.tar.gz
+- **v2.0**: https://download.pingcap.org/tidb-lightning-release-2.0-linux-amd64.tar.gz
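+
+For example, the v2.1 tool set could be downloaded and unpacked like this (a sketch; substitute the
+URL matching your cluster version):
+
+```sh
+# Download the tool set and unpack it.
+wget https://download.pingcap.org/tidb-lightning-release-2.1-linux-amd64.tar.gz
+tar -xzf tidb-lightning-release-2.1-linux-amd64.tar.gz
+```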
+
+### Starting `tikv-importer`
+
+1. Upload `bin/tikv-importer` from the tool set.
+
+2. Configure `tikv-importer.toml`:
+
+ ```toml
+ # TiKV Importer configuration file template
+
+ # Log file
+ log-file = "tikv-importer.log"
+ # Log level: trace, debug, info, warn, error, off.
+ log-level = "info"
+
+ [server]
+ # Listening address of tikv-importer. tidb-lightning needs to connect to
+ # this address to write data.
+ addr = "0.0.0.0:20170"
+ # Size of thread pool for the gRPC server.
+ grpc-concurrency = 16
+
+ [metric]
+ # The Prometheus client push job name.
+ job = "tikv-importer"
+ # The Prometheus client push interval.
+ interval = "15s"
+ # The Prometheus Pushgateway address.
+ address = ""
+
+ [rocksdb]
+ # The maximum number of concurrent background jobs.
+ max-background-jobs = 32
+
+ [rocksdb.defaultcf]
+ # Amount of data to build up in memory before flushing data to the disk.
+ write-buffer-size = "1GB"
+ # The maximum number of write buffers that are built up in memory.
+ max-write-buffer-number = 8
+
+ # The compression algorithms used in different levels.
+ # The algorithm at level-0 is used to compress KV data.
+ # The algorithm at level-6 is used to compress SST files.
+ # The algorithms at level-1 to level-5 are unused for now.
+ compression-per-level = ["lz4", "no", "no", "no", "no", "no", "zstd"]
+
+ [import]
+ # The directory to store engine files.
+ import-dir = "/tmp/tikv/import"
+ # Number of threads to handle RPC requests.
+ num-threads = 16
+ # Number of concurrent import jobs.
+ num-import-jobs = 24
+ # Maximum duration to prepare regions.
+ #max-prepare-duration = "5m"
+ # Split regions into this size according to the importing data.
+ #region-split-size = "96MB"
+ # Stream channel window size, stream will be blocked on channel full.
+ #stream-channel-window = 128
+ # Maximum number of open engines.
+ max-open-engines = 8
+ ```
+
+3. Run `tikv-importer`.
+
+ ```sh
+ nohup ./tikv-importer -C tikv-importer.toml > nohup.out &
+ ```
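+
+ To verify that Importer started correctly, you could check its log and listening port
+ (a quick sanity check, assuming the default `log-file` and `addr` from the template above):
+
+ ```sh
+ # Look for startup errors and confirm the gRPC port is open.
+ grep -i error tikv-importer.log
+ ss -ltn | grep 20170
+ ```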
+
+### Starting `tidb-lightning`
+
+1. Upload `bin/tidb-lightning` and `bin/tidb-lightning-ctl` from the tool set.
+
+2. Mount the mydumper SQL dump onto the same machine.
+
+3. Configure `tidb-lightning.toml`:
+
+ ```toml
+ ### tidb-lightning configuration
+
+ [lightning]
+ # HTTP port for debugging and Prometheus metrics pulling (0 to disable)
+ pprof-port = 10089
+
+ # check if the cluster satisfies the minimum requirement before starting
+ #check-requirements = true
+
+ # The maximum number of tables to be handled concurrently.
+ # Must not exceed the max-open-engines setting for tikv-importer.
+ table-concurrency = 8
+ # The concurrency of data import. It is set to the number of logical CPU
+ # cores by default. When deploying together with other components, you can
+ # set it to 75% of the number of logical CPU cores to limit the CPU usage.
+ #region-concurrency =
+
+ # Logging
+ level = "info"
+ file = "tidb-lightning.log"
+ max-size = 128 # MB
+ max-days = 28
+ max-backups = 14
+
+ [checkpoint]
+ # Whether to enable checkpoints.
+ # While importing, Lightning records which tables have been imported, so
+ # even if Lightning or another component crashes, the import can resume from
+ # a known good state instead of redoing everything.
+ enable = true
+ # The schema name (database name) to store the checkpoints
+ schema = "tidb_lightning_checkpoint"
+ # The data source name (DSN) in the form "USER:PASS@tcp(HOST:PORT)/".
+ # If not specified, the TiDB server from the [tidb] section will be used to
+ # store the checkpoints. You could also specify a different MySQL-compatible
+ # database server to reduce the load of the target TiDB cluster.
+ #dsn = "root@tcp(127.0.0.1:4000)/"
+ # Whether to keep the checkpoints after all data are imported. If false, the
+ # checkpoints will be deleted. Keeping the checkpoints can aid debugging but
+ # will leak metadata about the data source.
+ #keep-after-success = false
+
+ [tikv-importer]
+ # The listening address of tikv-importer. Change it to the actual address
+ addr = "172.16.31.10:20170"
+
+ [mydumper]
+ # Block size for file reading. Should be larger than the length of the
+ # longest string in the data source.
+ read-block-size = 4096 # Byte (default = 4 KB)
+ # Each data file will be split into multiple chunks of this size. Each chunk
+ # will be processed in parallel.
+ region-min-size = 268435456 # Byte (default = 256 MB)
+ # mydumper local source data directory
+ data-source-dir = "/data/my_database"
+ # If no-schema is set to true, tidb-lightning assumes the table skeletons
+ # already exist on the target TiDB cluster, and will not execute the CREATE
+ # TABLE statements.
+ no-schema = false
+
+ [tidb]
+ # Configuration of any one TiDB server from the cluster
+ host = "172.16.31.1"
+ port = 4000
+ user = "root"
+ password = ""
+ # Table schema information is fetched from TiDB via this status-port.
+ status-port = 10080
+ # Address of any one PD server from the cluster
+ pd-addr = "172.16.31.4:2379"
+ # tidb-lightning imports TiDB as a library and generates some logs itself.
+ # This setting controls the log level of the TiDB library.
+ log-level = "error"
+ # Sets TiDB session variable to speed up the Checksum and Analyze operations.
+ distsql-scan-concurrency = 16
+
+ # When data importing is complete, tidb-lightning can automatically perform
+ # the Checksum, Compact and Analyze operations. It is recommended to leave
+ # these as true in the production environment.
+ # The execution order: Checksum -> Compact -> Analyze
+ [post-restore]
+ # Performs `ADMIN CHECKSUM TABLE` for each table to verify data integrity.
+ checksum = true
+ # Performs compaction on the TiKV cluster.
+ compact = true
+ # Performs `ANALYZE TABLE` for each table.
+ analyze = true
+ ```
+
+4. Run `tidb-lightning`.
+
+ ```sh
+ nohup ./tidb-lightning -config tidb-lightning.toml > nohup.out &
+ ```
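+
+ Progress and errors are written to `tidb-lightning.log` (as configured above). For example, you
+ could follow the log and search for non-fatal errors while the import is running:
+
+ ```sh
+ # Follow the import progress and look for error entries.
+ tail -f tidb-lightning.log
+ grep '\[error\]' tidb-lightning.log
+ ```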
diff --git a/docs/en_US/03-Checkpoints.md b/docs/en_US/03-Checkpoints.md
new file mode 100644
index 0000000000000..625eeb21aa65a
--- /dev/null
+++ b/docs/en_US/03-Checkpoints.md
@@ -0,0 +1,105 @@
+Using Checkpoints
+=================
+
+Importing a large database usually takes hours or days, and if such a long-running process crashes,
+it would waste a lot of time to redo the previously completed tasks. Lightning
+uses *checkpoints* to store the import progress, so that a restarted `tidb-lightning` continues
+importing from where it left off.
+
+```toml
+[checkpoint]
+# Whether to enable checkpoints.
+# While importing, Lightning will record which tables have been imported, so
+# even if Lightning or another component crashes, the import can resume from
+# a known good state instead of redoing everything.
+enable = true
+
+# The schema name (database name) to store the checkpoints
+schema = "tidb_lightning_checkpoint"
+
+# The data source name (DSN) in the form "USER:PASS@tcp(HOST:PORT)/".
+# If not specified, the TiDB server from the [tidb] section will be used to
+# store the checkpoints. You could also specify a different MySQL-compatible
+# database server to reduce the load of the target TiDB cluster.
+#dsn = "root@tcp(127.0.0.1:4000)/"
+
+# Whether to keep the checkpoints after all data are imported. If false, the
+# checkpoints will be deleted. Keeping the checkpoints can aid debugging but
+# will leak metadata about the data source.
+#keep-after-success = false
+```
+
+Storage
+-------
+
+Checkpoints can be saved in any database compatible with MySQL 5.7 or above, including MariaDB and
+TiDB. By default, the checkpoints are saved in the target database.
+
+When the target database is used as the checkpoint storage, Lightning is simultaneously importing
+large amounts of data into it. This puts extra stress on the target database and sometimes leads
+to communication timeouts. Therefore, **we strongly recommend installing a temporary MySQL server to
+store these checkpoints**. This server can be installed on the same host as `tidb-lightning` and can
+be uninstalled after the import is completed.
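+
+For example, with a temporary MySQL server running locally on the `tidb-lightning` host, the
+checkpoint section could look like this (a sketch; the user and port are placeholders for your
+own server):
+
+```toml
+[checkpoint]
+enable = true
+schema = "tidb_lightning_checkpoint"
+# Store the checkpoints in the local MySQL server instead of the target TiDB cluster.
+dsn = "root@tcp(127.0.0.1:3306)/"
+```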
+
+Checkpoint control
+------------------
+
+If `tidb-lightning` exits abnormally due to unrecoverable errors (e.g. data corruption), it will
+refuse to reuse the checkpoints until the errors are resolved. This is to prevent worsening the
+situation. The checkpoint errors can be resolved using the `tidb-lightning-ctl` program.
+
+### `--checkpoint-error-destroy`
+
+```sh
+tidb-lightning-ctl --checkpoint-error-destroy='`schema`.`table`'
+```
+
+If importing the table `` `schema`.`table` `` failed previously, this command:
+
+1. DROPs the table `` `schema`.`table` `` from the target database, i.e. removing all imported data.
+2. resets the checkpoint record of this table to be "not yet started".
+
+If there are no errors involving the table `` `schema`.`table` ``, this operation does nothing.
+
+This option allows us to restart importing the table from scratch. The schema and table names
+must be quoted with backquotes and are case-sensitive.
+
+```sh
+tidb-lightning-ctl --checkpoint-error-destroy=all
+```
+
+This is the same as applying the above to every table. It is the most convenient, safe and
+conservative solution to fix checkpoint errors.
+
+### `--checkpoint-error-ignore`
+
+```sh
+tidb-lightning-ctl --checkpoint-error-ignore='`schema`.`table`'
+tidb-lightning-ctl --checkpoint-error-ignore=all
+```
+
+If importing the table `` `schema`.`table` `` failed previously, this clears the error status as
+if nothing happened. The `all` variant applies this operation to all tables.
+
+This should only be used when you are sure that the error can indeed be ignored. If not, some
+imported data could be lost. The only safety net is the final "checksum" check, and thus the
+"checksum" option should always be enabled when using `--checkpoint-error-ignore`.
+
+### `--checkpoint-remove`
+
+```sh
+tidb-lightning-ctl --checkpoint-remove='`schema`.`table`'
+tidb-lightning-ctl --checkpoint-remove=all
+```
+
+Removes all checkpoint information about one table / all tables, regardless of their status.
+
+### `--checkpoint-dump`
+
+```sh
+tidb-lightning-ctl --checkpoint-dump=output/directory
+```
+
+Dumps the content of the checkpoint into the given directory. Mainly used for debugging by technical
+staff.
+
diff --git a/docs/en_US/04-Metrics.md b/docs/en_US/04-Metrics.md
new file mode 100644
index 0000000000000..42b853addb50b
--- /dev/null
+++ b/docs/en_US/04-Metrics.md
@@ -0,0 +1,124 @@
+Metrics
+=======
+
+Both `tidb-lightning` and `tikv-importer` support metrics collection via
+[Prometheus](https://prometheus.io/).
+
+Configuration
+-------------
+
+If you installed Lightning via TiDB-Ansible, adding the servers to the `[monitored_servers]`
+section in `inventory.ini` is sufficient for the Prometheus server to collect their metrics.
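+
+For example, reusing the server addresses from the deployment chapter (an illustrative snippet;
+the real `inventory.ini` also lists the other monitored hosts):
+
+```ini
+[monitored_servers]
+192.168.20.9
+192.168.20.10
+```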
+
+If you installed Lightning manually, follow the instructions below.
+
+### `tikv-importer`
+
+`tikv-importer` v2.1 uses [Pushgateway](https://github.com/prometheus/pushgateway) to deliver
+metrics. Configure `tikv-importer.toml` to recognize the Pushgateway with the following settings:
+
+```toml
+[metric]
+
+# The Prometheus client push job name.
+job = "tikv-importer"
+
+# The Prometheus client push interval.
+interval = "15s"
+
+# The Prometheus Pushgateway address.
+address = ""
+```
+
+### `tidb-lightning`
+
+The metrics of `tidb-lightning` can be gathered directly by Prometheus as long as the server is
+discovered. The metrics port can be set in `tidb-lightning.toml`:
+
+```toml
+[lightning]
+# HTTP port for debugging and Prometheus metrics pulling (0 to disable)
+pprof-port = 10089
+
+...
+```
+
+Prometheus needs to be configured to discover the `tidb-lightning` server. For instance, you could
+hard-code the server address in the `scrape_configs` section:
+
+```yaml
+...
+scrape_configs:
+ - job_name: 'tidb-lightning'
+ static_configs:
+ - targets: ['192.168.20.10:10089']
+```
+
+Raw metrics
+-----------
+
+### `tikv-importer`
+
+Metrics provided by `tikv-importer` are listed under the namespace `tikv_import_*`.
+
+* **`tikv_import_rpc_duration`** (Histogram)
+
+ Bucketed histogram of import RPC duration. Labels:
+
+ * **request**: RPC name, e.g. `open_engine`, `import_engine`, etc.
+ * **result**: `ok` / `error`
+
+* **`tikv_import_write_chunk_bytes`** (Histogram)
+
+ Bucketed histogram of import write chunk bytes.
+
+* **`tikv_import_write_chunk_duration`** (Histogram)
+
+ Bucketed histogram of import write chunk duration.
+
+* **`tikv_import_upload_chunk_bytes`** (Histogram)
+
+ Bucketed histogram of import upload chunk bytes.
+
+* **`tikv_import_upload_chunk_duration`** (Histogram)
+
+ Bucketed histogram of import upload chunk duration.
+
+### `tidb-lightning`
+
+Metrics provided by `tidb-lightning` are listed under the namespace `lightning_*`.
+
+* **`lightning_importer_engine`** (Counter)
+
+ Counting open and closed engine files. Labels:
+
+ * **type**: `open` / `closed`
+
+* **`lightning_idle_workers`** (Gauge)
+
+ Counting idle workers. Values should be less than the `table-concurrency`/`region-concurrency`
+ settings and are typically zero. Labels:
+
+ * **name**: `table` / `region`
+
+* **`lightning_kv_encoder`** (Counter)
+
+ Counting open and closed KV encoders. KV encoders are in-memory TiDB instances which convert
+ SQL `INSERT` statements into KV pairs. The net value should be bounded in a healthy situation.
+ Labels:
+
+ * **type**: `open` / `closed`
+
+* **`lightning_tables`** (Counter)
+
+ Counting number of tables processed and their status. Labels:
+
+ * **state**: `pending` / `written` / `closed` / `imported` / `altered_auto_inc` / `checksum` / `completed`
+ * **result**: `success` / `failure`
+
+* **`lightning_chunks`** (Counter)
+
+ Counting number of chunks processed and their status. Labels:
+
+ * **state**: `estimated` / `pending` / `running` / `finished` / `failed`
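+
+For example, since every chunk is counted in each state it reaches, one rough way to estimate the
+overall progress in Prometheus is to compare finished chunks against the estimated total (an
+illustrative query, not an official dashboard expression):
+
+```
+sum(lightning_chunks{state="finished"}) / sum(lightning_chunks{state="estimated"})
+```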
diff --git a/docs/en_US/05-Errors.md b/docs/en_US/05-Errors.md
new file mode 100644
index 0000000000000..b7746e9c0599b
--- /dev/null
+++ b/docs/en_US/05-Errors.md
@@ -0,0 +1,69 @@
+Common Errors
+=============
+
+When Lightning encounters an unrecoverable error, it exits with a nonzero exit code and leaves the
+reason in the log file. Errors are typically printed at the end of the log. You can also search for
+the string `[error]` to look for non-fatal errors.
+
+Here are some commonly encountered errors in the `tidb-lightning` log file and their resolution.
+
+## checksum failed: checksum mismatched remote vs local
+
+**Cause**: The checksums of a table in the local data source and in the remote imported database differ.
+There are several possible reasons for this error:
+
+1. The table might have been populated before the import. This old data would affect the final checksum.
+
+2. If the table does not have an integer PRIMARY KEY, some rows might be imported repeatedly between
+ checkpoints. This is a known bug to be fixed in the next release.
+
+3. If the remote checksum is 0, i.e. nothing was imported, it is possible that the cluster is too hot
+ and failed to take in any data.
+
+4. If the data is mechanically generated, ensure it respects the constraints of the table:
+
+ * AUTO_INCREMENT columns should be positive, and should not contain the value "0".
+ * There should be no duplicated entries in UNIQUE and PRIMARY KEYs.
+
+**Solutions**:
+
+1. Delete the corrupted data with `tidb-lightning-ctl --checkpoint-error-destroy=all`, and restart
+ Lightning to import the affected tables again.
+
+2. Consider using an external database to store the checkpoints (change `[checkpoint] dsn`) to
+ reduce the target database's load.
+
+## ResourceTemporarilyUnavailable("Too many open engines …: 8")
+
+**Cause**: The number of concurrent engine files exceeds the limit imposed by `tikv-importer`. This
+could be caused by misconfiguration. Additionally, if `tidb-lightning` exited abnormally, an engine
+file might be left in a dangling open state, which could cause this error as well.
+
+**Solutions**:
+
+1. Increase the value of the `max-open-engines` setting in `tikv-importer.toml`. This value is typically
+ dictated by the available memory. This could be calculated as:
+
+ > Max Memory Usage ≈ `max-open-engines` × `write-buffer-size` × `max-write-buffer-number`
+
+ For instance, with the template values above (`max-open-engines = 8`, `write-buffer-size = "1GB"`,
+ `max-write-buffer-number = 8`), the total memory usage could reach about 64 GB.
+
+2. Decrease the value of `table-concurrency` so it is less than `max-open-engines`.
+
+3. Restart `tikv-importer` to forcefully remove all engine files. This also removes all
+ partially imported tables, so you also need to run `tidb-lightning-ctl --checkpoint-error-destroy=all`.
+
+## cannot guess encoding for input file, please convert to UTF-8 manually
+
+**Cause**: Lightning only recognizes the UTF-8 and GB-18030 encodings for the table schemas. This
+error is emitted if the file isn't in any of these encodings. It is also possible that the file has
+mixed encodings, e.g. it contains a string in UTF-8 and another string in GB-18030, due to historical
+`ALTER TABLE` executions.
+
+**Solutions**:
+
+1. Fix the schema so that the file is entirely in either UTF-8 or GB-18030.
+
+2. Manually CREATE the affected tables in the target database, then set
+ `[mydumper] no-schema = true` to skip automatic table creation.
+
+
diff --git a/docs/en_US/06-FAQ.md b/docs/en_US/06-FAQ.md
new file mode 100644
index 0000000000000..6dd93e206320e
--- /dev/null
+++ b/docs/en_US/06-FAQ.md
@@ -0,0 +1,127 @@
+Frequently Asked Questions
+==========================
+
+What is the minimum TiDB/TiKV/PD cluster version supported by Lightning?
+------------------------------------------------------------------------
+
+The minimum version is 2.0.4.
+
+Does Lightning support importing multiple schemas (databases)?
+--------------------------------------------------------------
+
+Yes.
+
+What are the privilege requirements for the target database?
+-----------------------------------------------------------
+
+Lightning requires the following privileges:
+
+* SELECT
+* UPDATE
+* ALTER
+* CREATE
+* DROP
+
+If the target database is used to store checkpoints, it additionally requires these privileges:
+
+* INSERT
+* DELETE
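+
+For example, a dedicated account could be granted these privileges as follows (a sketch; the user
+name `lightning` and the host are placeholders):
+
+```sql
+GRANT SELECT, UPDATE, ALTER, CREATE, DROP ON *.* TO 'lightning'@'192.168.20.10';
+-- Only needed when the target database also stores the checkpoints:
+GRANT INSERT, DELETE ON tidb_lightning_checkpoint.* TO 'lightning'@'192.168.20.10';
+```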
+
+Lightning encountered an error when importing one table. Will it affect other tables? Will the process be terminated?
+---------------------------------------------------------------------------------------------------------------------
+
+If an error is encountered in only one table, the rest will still be processed normally.
+
+How to ensure integrity of the imported data?
+---------------------------------------------
+
+Lightning by default performs checksums on both the local data source and the imported tables. If
+there is a checksum mismatch, the process is aborted. The checksum information can be read
+from the log.
+
+You could also execute the `ADMIN CHECKSUM TABLE` SQL command on the target table to recompute the
+checksum of the imported data.
+
+```text
+mysql> ADMIN CHECKSUM TABLE `schema`.`table`;
++---------+------------+---------------------+-----------+-------------+
+| Db_name | Table_name | Checksum_crc64_xor | Total_kvs | Total_bytes |
++---------+------------+---------------------+-----------+-------------+
+| schema | table | 5505282386844578743 | 3 | 96 |
++---------+------------+---------------------+-----------+-------------+
+1 row in set (0.01 sec)
+```
+
+What kind of data source format is supported by Lightning?
+----------------------------------------------------------
+
+In version 2.1.0 we only support SQL dump generated by
+[mydumper](https://github.com/pingcap/mydumper) stored in the local filesystem.
+
+Could Lightning skip creating schema and tables?
+------------------------------------------------
+
+Yes. If you have already created the tables in the target database, you could set `no-schema = true`
+in the `[mydumper]` section in `tidb-lightning.toml`. This makes Lightning skip the
+`CREATE TABLE` invocations and fetch the metadata directly from the target database. Lightning will
+exit with an error if a table is actually missing.
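+
+For example (reusing the setting from the deployment template):
+
+```toml
+[mydumper]
+# The tables must already exist in the target cluster; CREATE TABLE is skipped.
+no-schema = true
+```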
+
+Can the Strict SQL Mode be disabled to allow importing invalid data?
+--------------------------------------------------------------------
+
+Yes. By default, the [`sql_mode`] used by Lightning is `"STRICT_TRANS_TABLES,NO_ENGINE_SUBSTITUTION"`,
+which disallows invalid data such as the date `1970-00-00`. The mode can be changed by modifying the
+`sql-mode` setting in the `[tidb]` section in `tidb-lightning.toml`.
+
+```toml
+...
+[tidb]
+sql-mode = ""
+...
+```
+
+[`sql_mode`]: https://dev.mysql.com/doc/refman/5.7/en/sql-mode.html
+
+Can one `tikv-importer` serve multiple `tidb-lightning` instances?
+------------------------------------------------------------------
+
+Yes, as long as every `tidb-lightning` instance operates on a different set of tables.
+
+How to stop `tikv-importer`?
+----------------------------
+
+If it is deployed using TiDB-Ansible, run `scripts/stop_importer.sh` under the deployed folder.
+
+Otherwise, obtain the process ID with `ps aux | grep tikv-importer`, then run `kill «pid»`.
+
+How to stop `tidb-lightning`?
+-----------------------------
+
+If it is deployed using TiDB-Ansible, run `scripts/stop_lightning.sh` under the deployed folder.
+
+If `tidb-lightning` is running in foreground, simply press Ctrl+C to stop it.
+
+Otherwise, obtain the process ID with `ps aux | grep tidb-lightning`, then run `kill -2 «pid»`.
+
+Why does `tidb-lightning` suddenly quit while running in the background?
+----------------------------------------------------------------
+
+This is potentially caused by starting `tidb-lightning` incorrectly, which causes the system to send a
+SIGHUP signal to stop it. If this is the case, there should be a log entry like:
+
+```
+2018/08/10 07:29:08.310 main.go:47: [info] Got signal hangup to exit.
+```
+
+We do not recommend using `nohup` directly in the command line. Rather, put the `nohup` inside a
+script file and execute the script.
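+
+For example, a minimal wrapper script (assuming the binary and `tidb-lightning.toml` are in the
+current directory) could look like this:
+
+```sh
+#!/bin/sh
+# run_lightning.sh: start tidb-lightning in the background via nohup.
+nohup ./tidb-lightning -config tidb-lightning.toml > nohup.out &
+```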
+
+Why is my TiDB cluster using lots of CPU and running very slowly after using Lightning?
+---------------------------------------------------------------------------------------
+
+If `tidb-lightning` exited abnormally, the cluster might be stuck in "import mode", which is not
+suitable for production. You can force the cluster back to "normal mode" with:
+
+```sh
+tidb-lightning-ctl --switch-mode=normal
+```
diff --git a/docs/en_US/README.md b/docs/en_US/README.md
new file mode 100644
index 0000000000000..aaf62a29269ac
--- /dev/null
+++ b/docs/en_US/README.md
@@ -0,0 +1,26 @@
+TiDB Lightning
+==============
+
+**TiDB Lightning** is a tool for fast full import of large amounts of data into a TiDB cluster.
+Currently, it supports reading SQL dumps exported by mydumper.
+
+1. [Architecture](01-Architecture.md)
+2. [Deployment and Execution](02-Deployment.md)
+3. [Using Checkpoints](03-Checkpoints.md)
+4. [Metrics](04-Metrics.md)
+5. [Common Errors](05-Errors.md)
+6. [FAQ](06-FAQ.md)
+
+## Notes
+
+Before starting TiDB Lightning, note that:
+
+- During the import process, the cluster cannot provide normal services.
+- If `tidb-lightning` crashes, the cluster will be left in "import mode".
+ Forgetting to switch back to "normal mode" will lead to a high amount of uncompacted data on
+ the TiKV cluster, and will cause abnormally high CPU usage and stall.
+ You can manually switch the cluster back to "normal mode" via the `tidb-lightning-ctl` tool:
+
+ ```sh
+ bin/tidb-lightning-ctl -switch-mode=normal
+ ```
diff --git a/docs/en_US/tidb-lightning.svg b/docs/en_US/tidb-lightning.svg
new file mode 100644
index 0000000000000..d84e7f8834612
--- /dev/null
+++ b/docs/en_US/tidb-lightning.svg
@@ -0,0 +1,551 @@
+
+
+
+
diff --git a/docs/tidb-lightning-user-guide.md b/docs/tidb-lightning-user-guide.md
deleted file mode 100644
index 5cf2da8c36258..0000000000000
--- a/docs/tidb-lightning-user-guide.md
+++ /dev/null
@@ -1,238 +0,0 @@
-# TiDB Lightning User Guide
-
-TiDB Lightning is a data import tool which is used to fast import a large amount of data to the TiDB cluster. Currently, it only supports source data in the Mydumper file format and in the future it will support more formats like CSV.
-
-Now TiDB Lightning only supports full import of new tables. During the importing process, the cluster cannot provide services normally; as a result, TiDB Lightning is not suitable for importing data online.
-
-## TiDB Lightning architecture
-
-The following diagram shows the architecture of TiDB Lightning:
-
-![](./media/tidb-lightning-architecture.png)
-
-One set of TiDB Lightning has two components:
-
-- `tidb-lightning`
-
- The front-end part of TiDB Lightning. It transforms the source data into Key-Value (KV) pairs and writes the data into `tikv-importer`.
-
-- `tikv-importer`
-
- The back-end part of TiDB Lightning. It caches, sorts, and divides the KV pairs written by `tidb-lightning` and imports the KV pairs to the TiKV cluster.
-
-## TiDB Lightning workflow
-
-1. Before importing data, `tidb-lightning` automatically switches the TiKV mode to the import mode via API.
-2. `tidb-lightning` obtains data from the data source, transforms the source data into KV data, and then writes the data into `tikv-importer`.
-3. When the data written by `tidb-lightning` reaches a specific size, `tidb-lightning` sends the `Import` command to `tikv-importer`.
-4. `tikv-importer` divides and schedules the TiKV data of the target cluster and then imports the data to the TiKV cluster.
-5. `tidb-lightning` transforms and imports the source data continuously until it finishes importing the data in the source data directory.
-6. `tidb-lightning` performs the `Compact`, `Checksum`, and `Analyze` operation on tables in the target cluster.
-7. `tidb-lightning` automatically switches the TiKV mode to the normal mode. Then the TiDB cluster can provide services normally.
-
-## Deploy process
-
-### Notes
-
-Before deploying TiDB Lightning, you should take note that:
-
-- When TiDB Lightning is running, the TiDB cluster cannot provide services normally.
-- When you import data using TiDB Lightning, you cannot check some source data constraints such as the primary key conflict and unique index conflict. If needed, you can check using `ADMIN CHECK TABLE` via the MySQL client after importing, but it may take a long time.
-- Currently, TiDB Lightning does not support breakpoint. If any error occurs during importing, delete the data from the target cluster using `DROP TABLE` and import the data again.
-- If TiDB Lightning exits abnormally, you need to use the `-swtich-mode` command line parameter of `tidb-lightning` to manually close the import mode of the TiKV cluster and change it to the normal mode:
-
- ```
- ./bin/tidb-lightning -switch-mode normal
- ```
-
-### Hardware requirements
-
-#### Hardware requirements for separate deployment
-
-The following are the hardware requirements for deploying one set of TiDB Lighting. If you have enough machines, you can deploy multiple sets of TiDB Lightning, divide the source code based on the table grain and then import the data concurrently.
-
-- tidb-lightning
-
- - 32+ logical core CPU
- - 16 GB+ memory
- - 1 TB+ SSD
- - 10 Gigabit network card
- - Need to be deployed separately from the online business because TiDB Lighting fully consumes the CPU during runtime. Under certain circumstances, you can deploy it along with another component (like `tidb-server`) on one machine and configure to limit the CPU usage of `tidb-lightning`. See the `region-concurrency` part in the first step of [Deploy `tidb-lightning`](#deploy-tidb-lightning).
-
-- tikv-importer
-
- - 32+ logical core CPU
- - 32 GB+ memory
- - 1 TB+ SSD
- - 10 Gigabit network card
- - Need to be deployed separately from the online business because TiDB Lighting fully consumes the CPU, I/O and the network bandwidth during runtime. Under certain circumstances, you can deploy it along with other component (like `tikv-server`) on one machine, but the importing speed may be affected.
-
-#### Hardware requirements for mixed deployment
-
-Under certain circumstances, you can deploy `tidb-lightning` and `tikv-importer` (or another application) mixedly on one machine.
-
-- 32+ logical core CPU
-- 32 GB+ memory
-- 1 TB+ SSD
-- 10 Gigabit network card
-
-> **Note:** `tidb-lightning` is CPU intensive. If you use mixed deployment for it, you need to configure `region-concurrency` to limit the number of occupied CPU cores of `tidb-lightning`. Otherwise other applications may be affected.
-
-> You can configure the `region-concurrency` parameter of `tidb-lightning` to allocate 75% of CPU resources to `tidb-lightning`. For example, if CPU has 32 logical cores, you can set `region-concurrency` to 24.
-
-### Prepare
-
-Before importing, you should:
-
-- Deploy a set of TiDB cluster (TiDB version is 2.0.4 or later) which is the target cluster for importing (the target cluster).
-- Prepare the binary file and the configuration file of `tikv-importer`. It is recommended to use standalone deployment.
-- Prepare the binary file and the configuration file of `tidb-lightning`. It is recommended to use standalone deployment.
-
-Download the installation packages of `tikv-importer` and `tidb-lightning` via:
-
-https://download.pingcap.org/tidb-lightning-latest-linux-amd64.tar.gz
-
-### Deploy
-
-#### Deploy the TiDB cluster
-
-For details, see [Deploy TiDB Using Ansible](https://pingcap.com/docs/op-guide/ansible-deployment/).
-
-#### Deploy `tikv-importer`
-
-1. Configure `tikv-importer.toml.
-
- ```
- # TiKV Importer configuration file template
-
- # log file.
- log-file = "tikv-importer.log"
- # log level: trace, debug, info, warn, error, off.
- log-level = "info"
-
- [server]
- # the listening address of tikv-importer. tidb-lightning needs to connect to this address to write data. Set it to the actual IP address.
- addr = "172.16.30.4:20170"
- # size of thread pool for the gRPC server.
- grpc-concurrency = 16
-
- [metric]
- # the Prometheus client push job name.
- job = "tikv-importer"
- # the Prometheus client push interval.
- interval = "15s"
- # the Prometheus Pushgateway address.
- address = ""
-
- [rocksdb]
- # the maximum number of concurrent background jobs.
- max-background-jobs = 32
-
- [rocksdb.defaultcf]
- # amount of data to build up in memory before flushing data to the disk.
- write-buffer-size = "1GB"
- # the maximum number of write buffers that are built up in memory.
- max-write-buffer-number = 8
-
- # the compression algorithms used in different levels.
- # the algorithm at level-0 is used to compress KV data.
- # the algorithm at level-6 is used to compress SST files.
- # the algorithms at level-1 ~ level-5 are not used now.
- compression-per-level = ["lz4", "no", "no", "no", "no", "no", "zstd"]
-
- [import]
- # this directory is used to store the data written by `tidb-lightning`.
- import-dir = "/tmp/tikv/import"
- # the number of threads to handle RPC requests.
- num-threads = 16
- # the number of concurrent import jobs.
- num-import-jobs = 24
- # the stream channel window size. Stream will be blocked when the channel is full.
- stream-channel-window = 128
- ```
-
-2. Run the executable file of `tikv-importer`.
-
- ```
- nohup ./tikv-importer -C tikv-importer.toml > nohup.out &
- ```
-
-#### Deploy `tidb-lightning`
-
-1. Configure `tidb-lightning.toml`.
-
- ```
- ### tidb-lightning configuration
- [lightning]
-
- # background profile for debugging ( 0 to disable )
- pprof-port = 10089
-
-
- # table-concurrency controls the maximum handled tables concurrently while reading Mydumper SQL files. It can affect the tikv-importer memory usage amount.
- table-concurrency = 8
- # region-concurrency changes the concurrency number of data. It is set to the number of logical CPU cores by default and needs no configuration.
- # in mixed configuration, you can set it to 75% of the size of logical CPU cores.
- # region-concurrency default to runtime.NumCPU()
- # region-concurrency =
-
-
- # logging
- level = "info"
- file = "tidb-lightning.log"
- max-size = 128 # MB
- max-days = 28
- max-backups = 14
-
- [tikv-importer]
- # the listening address of tikv-importer. Change it to the actual address in tikv-importer.toml.
- addr = "172.16.31.4:20170"
- # size of batch to import KV data into TiKV: xxx (GB)
- batch-size = 500 # GB
-
- [mydumper]
- # block size of file reading
- read-block-size = 4096 # Byte (default = 4 KB)
- # divide source data file into multiple Region/chunk to execute restoring in parallel
- region-min-size = 268435456 # Byte (default = 256 MB)
- # the source data directory of Mydumper. tidb-lightning will automatically create the corresponding database and tables based on the schema file in the directory.
- data-source-dir = "/data/mydumper"
- # If no-schema is set to true, tidb-lightning will obtain the table schema information from tidb-server, instead of creating the database or tables based on the schema file of data-source-dir. This applies to manually creating tables or the situation where the table schema exits in TiDB.
- no-schema = false
-
-
-
- # configuration for TiDB (pick one of them if it has many TiDB servers) and the PD server.
- [tidb]
- # the target cluster information
- # the listening address of tidb-server. Setting one of them is enough.
- host = "127.0.0.1"
- port = 4000
- user = "root"
- password = ""
- # table schema information is fetched from TiDB via this status-port.
- status-port = 10080
- # the listening address of pd-server. Setting one of them is enough.
- pd-addr = "127.0.0.1:2379"
- # Lightning uses some code of TiDB (used as a library) and the flag controls its log level.
- log-level = "error"
- # set TiDB session variable to speed up performing the Checksum or Analyze operation on the table.
- distsql-scan-concurrency = 16
-
- # when data importing is finished, tidb-lightning can automatically perform the Checksum, Compact and Analyze operations.
- # it is recommended to set it to true in the production environment.
- # the execution order: Checksum -> Compact -> Analyze
- [post-restore]
- if it is set to true, tidb-lightning will perform the ADMIN CHECKSUM TABLE operation on the tables one by one.
- checksum = true
- # if it is set to true, tidb-lightning will perform a full Compact operation on all the data. If the Compact operation fails, you can use ./bin/tidb-lightning -compact or the command of tikv-ctl to compact the data manually.
- compact = true
- # if it is set to true, tidb-lightning will perform the ANALYZE TABLE