-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature-#1918][s3] Add support for reading all types of documents su…
…pported by Apache Tika
- Loading branch information
Showing
15 changed files
with
695 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- | ||
Licensed to the Apache Software Foundation (ASF) under one | ||
or more contributor license agreements. See the NOTICE file | ||
distributed with this work for additional information | ||
regarding copyright ownership. The ASF licenses this file | ||
to you under the Apache License, Version 2.0 (the | ||
"License"); you may not use this file except in compliance | ||
with the License. You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, | ||
software distributed under the License is distributed on an | ||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
KIND, either express or implied. See the License for the | ||
specific language governing permissions and limitations | ||
under the License. | ||
--> | ||
|
||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<parent> | ||
<artifactId>chunjun-connectors</artifactId> | ||
<groupId>com.dtstack.chunjun</groupId> | ||
<version>${revision}</version> | ||
</parent> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<artifactId>chunjun-connector-format-base</artifactId> | ||
<name>ChunJun : Connector : Format Base</name> | ||
|
||
<properties> | ||
<tika.version>2.8.0</tika.version> | ||
</properties> | ||
|
||
<dependencies> | ||
<!-- 部署时上传[tika-core-2.8.0.jar] https://repo1.maven.org/maven2/org/apache/tika/tika-core/2.8.0/tika-core-2.8.0.jar --> | ||
<dependency> | ||
<groupId>org.apache.tika</groupId> | ||
<artifactId>tika-core</artifactId> | ||
<version>${tika.version}</version> | ||
<scope>provided</scope> | ||
</dependency> | ||
|
||
<!-- 部署时上传[tika-server-standard-2.8.0.jar] https://archive.apache.org/dist/tika/2.8.0/tika-server-standard-2.8.0.jar --> | ||
<dependency> | ||
<groupId>org.apache.tika</groupId> | ||
<artifactId>tika-parsers-standard-package</artifactId> | ||
<version>${tika.version}</version> | ||
<scope>provided</scope> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
37 changes: 37 additions & 0 deletions
37
...-format-base/src/main/java/com/dtstack/chunjun/connector/format/base/common/TikaData.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.dtstack.chunjun.connector.format.base.common; | ||
|
||
import lombok.Data; | ||
|
||
@Data | ||
public class TikaData { | ||
|
||
private String[] data; | ||
private boolean end; | ||
|
||
public TikaData(String[] data, boolean end) { | ||
this.data = data; | ||
this.end = end; | ||
} | ||
|
||
public TikaData(String[] data) { | ||
this.data = data; | ||
} | ||
} |
47 changes: 47 additions & 0 deletions
47
...t-base/src/main/java/com/dtstack/chunjun/connector/format/base/config/TikaReadConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.dtstack.chunjun.connector.format.base.config; | ||
|
||
import lombok.Data; | ||
|
||
import java.io.Serializable; | ||
|
||
@Data | ||
public class TikaReadConfig implements Serializable { | ||
|
||
public static final String ORIGINAL_FILENAME = "_ORIGINAL_FILENAME"; | ||
|
||
private static final long serialVersionUID = 9142075335239994317L; | ||
|
||
/** 是否启用tika提取 */ | ||
private boolean useExtract = false; | ||
|
||
/** 内容重合度比例值 0-100 */ | ||
private int overlapRatio = 0; | ||
|
||
/** 是否启动分块 */ | ||
private boolean enableChunk = false; | ||
|
||
/** 分块大小 */ | ||
private int chunkSize = -1; | ||
|
||
public boolean isEnableChunk() { | ||
return chunkSize > 0 ? true : false; | ||
} | ||
} |
43 changes: 43 additions & 0 deletions
43
...mat-base/src/main/java/com/dtstack/chunjun/connector/format/base/options/TikaOptions.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.dtstack.chunjun.connector.format.base.options; | ||
|
||
import org.apache.flink.configuration.ConfigOption; | ||
import org.apache.flink.configuration.ConfigOptions; | ||
|
||
public class TikaOptions { | ||
|
||
public static final ConfigOption<Boolean> USE_EXTRACT = | ||
ConfigOptions.key("tika-use-extract") | ||
.booleanType() | ||
.defaultValue(false) | ||
.withDescription("use tika extract"); | ||
|
||
public static final ConfigOption<Integer> OVERLAP_RATIO = | ||
ConfigOptions.key("tika-overlap-ratio") | ||
.intType() | ||
.defaultValue(0) | ||
.withDescription("content overlap ratio"); | ||
|
||
public static final ConfigOption<Integer> CHUNK_SIZE = | ||
ConfigOptions.key("tika-chunk-size") | ||
.intType() | ||
.defaultValue(-1) | ||
.withDescription("chunk size"); | ||
} |
119 changes: 119 additions & 0 deletions
119
...-base/src/main/java/com/dtstack/chunjun/connector/format/base/source/TikaInputFormat.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.dtstack.chunjun.connector.format.base.source; | ||
|
||
import com.dtstack.chunjun.connector.format.base.common.TikaData; | ||
import com.dtstack.chunjun.connector.format.base.config.TikaReadConfig; | ||
|
||
import lombok.extern.slf4j.Slf4j; | ||
import org.apache.commons.lang3.concurrent.BasicThreadFactory; | ||
|
||
import java.io.Closeable; | ||
import java.io.InputStream; | ||
import java.util.concurrent.BlockingQueue; | ||
import java.util.concurrent.LinkedBlockingDeque; | ||
import java.util.concurrent.LinkedBlockingQueue; | ||
import java.util.concurrent.ThreadPoolExecutor; | ||
import java.util.concurrent.TimeUnit; | ||
|
||
import static java.util.concurrent.TimeUnit.NANOSECONDS; | ||
|
||
@Slf4j | ||
public class TikaInputFormat implements Closeable { | ||
private ThreadPoolExecutor executorService; | ||
private final BlockingQueue<TikaData> queue = new LinkedBlockingQueue<>(4096); | ||
private TikaReadConfig tikaReadConfig; | ||
private TikaData row; | ||
private int fieldCount; | ||
|
||
public TikaInputFormat(TikaReadConfig tikaReadConfig, int fieldCount) { | ||
this.tikaReadConfig = tikaReadConfig; | ||
this.fieldCount = fieldCount; | ||
} | ||
|
||
public void open(InputStream inputStream, String originalFilename) { | ||
this.executorService = | ||
new ThreadPoolExecutor( | ||
1, | ||
1, | ||
0, | ||
NANOSECONDS, | ||
new LinkedBlockingDeque<>(2), | ||
new BasicThreadFactory.Builder() | ||
.namingPattern("tika-schedule-pool-%d") | ||
.daemon(false) | ||
.build()); | ||
TikaReaderExecutor executor = | ||
new TikaReaderExecutor(tikaReadConfig, queue, inputStream, originalFilename); | ||
executorService.execute(executor); | ||
} | ||
|
||
public boolean hasNext() { | ||
try { | ||
row = queue.poll(3000L, TimeUnit.MILLISECONDS); | ||
// 如果没有数据,则继续等待 | ||
if (row == null) { | ||
log.warn("Waiting for queue get tika data"); | ||
hasNext(); | ||
} | ||
if (row != null && row.isEnd()) { | ||
return false; | ||
} | ||
return true; | ||
} catch (InterruptedException e) { | ||
throw new RuntimeException( | ||
"cannot get data from the queue because the current thread is interrupted.", e); | ||
} | ||
} | ||
|
||
/** 根据声明的字段个数,对数据进行补全 */ | ||
public String[] nextRecord() { | ||
String[] data = row.getData(); | ||
if (fieldCount == data.length) { | ||
return data; | ||
} | ||
if (fieldCount < data.length) { | ||
fieldCount = data.length; | ||
} | ||
return formatValue(data); | ||
} | ||
|
||
private String[] formatValue(String[] data) { | ||
String[] record = initDataContainer(fieldCount, ""); | ||
// because fieldCount is always >= data.length | ||
System.arraycopy(data, 0, record, 0, data.length); | ||
return record; | ||
} | ||
|
||
private String[] initDataContainer(int capacity, String defValue) { | ||
String[] container = new String[capacity]; | ||
for (int i = 0; i < capacity; i++) { | ||
container[i] = defValue; | ||
} | ||
return container; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
if (executorService != null) { | ||
executorService.shutdown(); | ||
queue.clear(); | ||
} | ||
} | ||
} |
Oops, something went wrong.