byzer-org · allwefantasy · Sep 1, 2022 · Aug 31, 2022
diff --git a/streamingpro-core/src/main/java/tech/mlsql/tool/HDFSOperatorV2.scala b/streamingpro-core/src/main/java/tech/mlsql/tool/HDFSOperatorV2.scala
@@ -1,15 +1,14 @@
 package tech.mlsql.tool
 
-import java.io.{BufferedReader, ByteArrayOutputStream, File, InputStream, InputStreamReader}
 import org.apache.commons.io.FileUtils
-import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileStatus, FileSystem, Path}
+import org.apache.hadoop.fs._
 import org.apache.hadoop.io.IOUtils
 import org.apache.spark.MLSQLSparkUtils
 import tech.mlsql.common.utils.Md5
 import tech.mlsql.common.utils.path.PathFun
 
+import java.io.{FileSystem => _, _}
 import scala.collection.mutable.ArrayBuffer
 
 /**
@@ -266,31 +265,33 @@ object HDFSOperatorV2 {
     }
   }
 
-  def saveWithoutTopNLines(inPath: String, skipFirstNLines: Int, header: Boolean): String = {
+  def saveWithoutTopNLines(inPath: String, skipFirstNLines: Int, header: Boolean, encoding: String): String = {
     val fs = FileSystem.get(hadoopConfiguration)
+    val charset = encoding.trim.toLowerCase
     val src: Path = new Path(inPath)
     var br: BufferedReader = null
     var line: String = null
     var dos: FSDataOutputStream = null
     val pathElements = inPath.split(PathFun.pathSeparator)
-    val writePathParts = pathElements.take(pathElements.length - 1) :+ String.format("skipFirstNLines_%s_%s", String.valueOf(skipFirstNLines), pathElements(pathElements.length - 1))
+    val writePathParts = pathElements.take(pathElements.length - 1) :+
+      s"skipFirstNLines_${skipFirstNLines}_${charset}_${pathElements(pathElements.length - 1)}"
     val outPath = writePathParts.mkString(PathFun.pathSeparator)
     if (!fileExists(outPath)) {
       try {
         dos = fs.create(new Path(new java.io.File(outPath).getPath), true)
-        br = new BufferedReader(new InputStreamReader(fs.open(src)))
+        br = new BufferedReader(new InputStreamReader(fs.open(src), charset))
         line = br.readLine()
 
         var count = 1
         while (line != null) {
 
           if (header && count == 1) {
-            dos.writeBytes(line + "\n")
+            dos.write((line + "\n").getBytes(charset))
             line = br.readLine()
           }
 
           if (count >= skipFirstNLines) {
-            dos.writeBytes(line + "\n")
+            dos.write((line + "\n").getBytes(charset))
           }
           count += 1
           line = br.readLine()

diff --git a/streamingpro-mlsql/src/main/java/streaming/core/datasource/impl/MLSQLCSV.scala b/streamingpro-mlsql/src/main/java/streaming/core/datasource/impl/MLSQLCSV.scala
@@ -27,8 +27,9 @@ class MLSQLCSV(override val uid: String) extends MLSQLBaseFileSource with WowPar
           }
           case _ => false
         }
+        val encoding = config.config.getOrElse("encoding", "utf-8")
         val path = originPath
-        val newPath = HDFSOperatorV2.saveWithoutTopNLines(path, numsToSkip, header)
+        val newPath = HDFSOperatorV2.saveWithoutTopNLines(path, numsToSkip, header, encoding)
         newPath
       }
     }