Skip to content

Commit

Permalink
🐛 Destination Postgres: fix \u0000(NULL) value processing (#5336)
Browse files Browse the repository at this point in the history
* fix \u0000(NULL) value processing for Postgres + move postgres impl of SqlOperations to PostgresSqlOperations.

* changelog + format

* incr release version

* Add generic solution to adapt messages for a destination + remove unnecessary serialization

* revert version for build

* minor review fixes

* format

* add comments

* format

* incr version
  • Loading branch information
DoNotPanicUA authored Aug 30, 2021
1 parent 8c1ff00 commit b18bd43
Show file tree
Hide file tree
Showing 24 changed files with 398 additions and 98 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"destinationDefinitionId": "25c5221d-dce2-4163-ade9-739ef790f503",
"name": "Postgres",
"dockerRepository": "airbyte/destination-postgres",
"dockerImageTag": "0.3.9",
"dockerImageTag": "0.3.10",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/postgres",
"icon": "postgresql.svg"
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
- destinationDefinitionId: 25c5221d-dce2-4163-ade9-739ef790f503
name: Postgres
dockerRepository: airbyte/destination-postgres
dockerImageTag: 0.3.9
dockerImageTag: 0.3.10
documentationUrl: https://docs.airbyte.io/integrations/destinations/postgres
icon: postgresql.svg
- destinationDefinitionId: b4c5d105-31fd-4817-96b6-cb923bfc04cb
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package io.airbyte.integrations.destination.buffered_stream_consumer;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Preconditions;
import io.airbyte.commons.concurrency.VoidCallable;
import io.airbyte.commons.functional.CheckedConsumer;
Expand Down Expand Up @@ -99,7 +100,7 @@ public class BufferedStreamConsumer extends FailureTrackingAirbyteMessageConsume
private final Set<AirbyteStreamNameNamespacePair> streamNames;
private final List<AirbyteMessage> buffer;
private final ConfiguredAirbyteCatalog catalog;
private final CheckedFunction<String, Boolean, Exception> isValidRecord;
private final CheckedFunction<JsonNode, Boolean, Exception> isValidRecord;
private final Map<AirbyteStreamNameNamespacePair, Long> pairToIgnoredRecordCount;
private final Consumer<AirbyteMessage> outputRecordCollector;
private final int queueBatchSize;
Expand All @@ -115,7 +116,7 @@ public BufferedStreamConsumer(Consumer<AirbyteMessage> outputRecordCollector,
RecordWriter recordWriter,
CheckedConsumer<Boolean, Exception> onClose,
ConfiguredAirbyteCatalog catalog,
CheckedFunction<String, Boolean, Exception> isValidRecord,
CheckedFunction<JsonNode, Boolean, Exception> isValidRecord,
int queueBatchSize) {
this.outputRecordCollector = outputRecordCollector;
this.queueBatchSize = queueBatchSize;
Expand Down Expand Up @@ -151,13 +152,12 @@ protected void acceptTracked(AirbyteMessage message) throws Exception {
if (message.getType() == Type.RECORD) {
final AirbyteRecordMessage recordMessage = message.getRecord();
final AirbyteStreamNameNamespacePair stream = AirbyteStreamNameNamespacePair.fromRecordMessage(recordMessage);
final String data = Jsons.serialize(message.getRecord().getData());

if (!streamNames.contains(stream)) {
throwUnrecognizedStream(catalog, message);
}

if (!isValidRecord.apply(data)) {
if (!isValidRecord.apply(message.getRecord().getData())) {
pairToIgnoredRecordCount.put(stream, pairToIgnoredRecordCount.getOrDefault(stream, 0L) + 1L);
return;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.buffered_stream_consumer;

import io.airbyte.protocol.models.AirbyteMessage;

/**
 * Supplies a formatted date string for a given {@link AirbyteMessage}.
 */
@FunctionalInterface
public interface StreamDateFormatter {

  /**
   * @param airbyteMessage message to derive the formatted date from
   * @return the formatted date string for the given message
   */
  String getFormattedDate(AirbyteMessage airbyteMessage);

}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import static org.mockito.Mockito.verifyNoInteractions;
import static org.mockito.Mockito.when;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import io.airbyte.commons.concurrency.VoidCallable;
Expand Down Expand Up @@ -85,7 +86,7 @@ public class BufferedStreamConsumerTest {
private VoidCallable onStart;
private RecordWriter recordWriter;
private CheckedConsumer<Boolean, Exception> onClose;
private CheckedFunction<String, Boolean, Exception> isValidRecord;
private CheckedFunction<JsonNode, Boolean, Exception> isValidRecord;
private Consumer<AirbyteMessage> outputRecordCollector;

@SuppressWarnings("unchecked")
Expand Down
4 changes: 0 additions & 4 deletions airbyte-integrations/connectors/destination-jdbc/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ plugins {
id 'airbyte-integration-test-java'
}

application {
mainClass = 'io.airbyte.integrations.destination.jdbc.JdbcDestination'
}

dependencies {
implementation 'com.google.cloud:google-cloud-storage:1.113.16'
implementation 'com.google.auth:google-auth-library-oauth2-http:0.25.5'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.jdbc;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.util.function.Function;
import java.util.function.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DataAdapter {

  private final Predicate<JsonNode> filterValueNode;
  private final Function<JsonNode, JsonNode> valueNodeAdapter;

  /**
   * Data adapter allows applying destination data rules. For example, Postgres destination can't
   * process text values with the \u0000 unicode character. You describe a filter condition for a
   * value node and a function which adapts the filtered value nodes.
   *
   * @param filterValueNode - filter condition which decides which value nodes should be adapted
   * @param valueNodeAdapter - transformation function which returns the adapted value node
   */
  public DataAdapter(
                     Predicate<JsonNode> filterValueNode,
                     Function<JsonNode, JsonNode> valueNodeAdapter) {
    this.filterValueNode = filterValueNode;
    this.valueNodeAdapter = valueNodeAdapter;
  }

  /**
   * Applies the adapter to every matching value node reachable from {@code messageData}. No-op when
   * the payload is null.
   */
  public void adapt(JsonNode messageData) {
    if (messageData != null) {
      adaptValueNodes(null, messageData, null);
    }
  }

  /**
   * Recursively inspects a json node. A matching value node is replaced in its parent container
   * (object field or array element) by its adapted version. Objects and arrays are traversed
   * recursively.
   *
   * @param fieldName name of the node in its parent object, or null for array elements / the root
   * @param node json node under inspection
   * @param parentNode parent of {@code node}, or null for the root
   */
  private void adaptValueNodes(String fieldName, JsonNode node, JsonNode parentNode) {
    if (node.isValueNode() && filterValueNode.test(node)) {
      if (fieldName != null) {
        ((ObjectNode) parentNode).set(fieldName, valueNodeAdapter.apply(node));
      } else {
        // A matching value node without a field name can only be the tree root here: matching
        // array elements are replaced in the array branch below, so they never reach this point.
        throw new RuntimeException("Unexpected value node without fieldName. Node: " + node);
      }
    } else if (node.isArray()) {
      // Replace matching scalar elements in place. Previously such elements were recursed into
      // with a null fieldName and crashed with a RuntimeException instead of being adapted.
      final ArrayNode arrayNode = (ArrayNode) node;
      for (int i = 0; i < arrayNode.size(); i++) {
        final JsonNode element = arrayNode.get(i);
        if (element.isValueNode()) {
          if (filterValueNode.test(element)) {
            arrayNode.set(i, valueNodeAdapter.apply(element));
          }
        } else {
          adaptValueNodes(null, element, node);
        }
      }
    } else {
      node.fields().forEachRemaining(entry -> adaptValueNodes(entry.getKey(), entry.getValue(), node));
    }
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,22 @@
import io.airbyte.db.jdbc.JdbcDatabase;
import io.airbyte.integrations.base.JavaBaseConstants;
import io.airbyte.protocol.models.AirbyteRecordMessage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.UUID;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.postgresql.copy.CopyManager;
import org.postgresql.core.BaseConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DefaultSqlOperations implements SqlOperations {
public abstract class JdbcSqlOperations implements SqlOperations {

private static final Logger LOGGER = LoggerFactory.getLogger(DefaultSqlOperations.class);
private static final Logger LOGGER = LoggerFactory.getLogger(JdbcSqlOperations.class);

@Override
public void createSchemaIfNotExists(JdbcDatabase database, String schemaName) throws Exception {
Expand All @@ -77,37 +71,6 @@ public String createTableQuery(JdbcDatabase database, String schemaName, String
schemaName, tableName, JavaBaseConstants.COLUMN_NAME_AB_ID, JavaBaseConstants.COLUMN_NAME_DATA, JavaBaseConstants.COLUMN_NAME_EMITTED_AT);
}

@Override
public void insertRecords(JdbcDatabase database, List<AirbyteRecordMessage> records, String schemaName, String tmpTableName) throws SQLException {
if (records.isEmpty()) {
return;
}

// todo (cgardens) - move this into a postgres version of this. this syntax is postgres-specific
database.execute(connection -> {
File tmpFile = null;
try {
tmpFile = Files.createTempFile(tmpTableName + "-", ".tmp").toFile();
writeBatchToFile(tmpFile, records);

var copyManager = new CopyManager(connection.unwrap(BaseConnection.class));
var sql = String.format("COPY %s.%s FROM stdin DELIMITER ',' CSV", schemaName, tmpTableName);
var bufferedReader = new BufferedReader(new FileReader(tmpFile));
copyManager.copyIn(sql, bufferedReader);
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
try {
if (tmpFile != null) {
Files.delete(tmpFile.toPath());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
}

protected void writeBatchToFile(File tmpFile, List<AirbyteRecordMessage> records) throws Exception {
PrintWriter writer = null;
try {
Expand Down Expand Up @@ -167,8 +130,28 @@ public boolean isSchemaRequired() {
}

@Override
public boolean isValidData(String data) {
public boolean isValidData(JsonNode data) {
return true;
}

/**
 * Adapts every record's json payload to the destination's data rules (see
 * {@link #getDataAdapter()}) and then delegates the actual insert to
 * {@link #insertRecordsInternal}.
 *
 * @param database database to write to
 * @param records records to insert; their data is adapted in place before the insert
 * @param schemaName target schema
 * @param tableName target table name
 * @throws Exception on any adaptation or insert failure
 */
@Override
public final void insertRecords(JdbcDatabase database,
                                List<AirbyteRecordMessage> records,
                                String schemaName,
                                String tableName)
    throws Exception {
  // Hoist the loop-invariant adapter lookup: the original created a fresh adapter
  // per record by calling getDataAdapter() inside the forEach.
  final DataAdapter dataAdapter = getDataAdapter();
  records.forEach(airbyteRecordMessage -> dataAdapter.adapt(airbyteRecordMessage.getData()));
  insertRecordsInternal(database, records, schemaName, tableName);
}

/**
 * Destination-specific bulk insert. Invoked by {@link #insertRecords} after each record's
 * payload has been run through the {@link DataAdapter} returned by {@link #getDataAdapter()}.
 *
 * @param database database to write to
 * @param records records whose data has already been adapted
 * @param schemaName target schema
 * @param tableName target table name
 * @throws Exception on any insert failure
 */
protected abstract void insertRecordsInternal(JdbcDatabase database,
List<AirbyteRecordMessage> records,
String schemaName,
String tableName)
throws Exception;

/**
 * Default adapter: matches no value nodes and passes values through untouched. Destinations
 * with data restrictions override this to supply their own filter/transformation.
 */
protected DataAdapter getDataAdapter() {
  return new DataAdapter(node -> false, node -> node);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package io.airbyte.integrations.destination.jdbc;

import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.db.jdbc.JdbcDatabase;
import io.airbyte.protocol.models.AirbyteRecordMessage;
import java.util.List;
Expand Down Expand Up @@ -115,7 +116,7 @@ public interface SqlOperations {
/**
* Check if the data record is valid and ok to be written to destination
*/
boolean isValidData(final String data);
boolean isValidData(final JsonNode data);

/**
* Denotes whether the destination has the concept of schema or not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,11 @@ private static RecordWriter recordWriterFunction(Map<AirbyteStreamNameNamespaceP
return (AirbyteStreamNameNamespacePair pair, List<AirbyteRecordMessage> records) -> {
for (AirbyteRecordMessage recordMessage : records) {
var id = UUID.randomUUID();
var data = Jsons.serialize(recordMessage.getData());
if (sqlOperations.isValidData(data)) {
if (sqlOperations.isValidData(recordMessage.getData())) {
// TODO Truncate json data instead of throwing whole record away?
// or should we upload it into a special rejected record folder in s3 instead?
var emittedAt = Timestamp.from(Instant.ofEpochMilli(recordMessage.getEmittedAt()));
pairToCopier.get(pair).write(id, data, emittedAt);
pairToCopier.get(pair).write(id, Jsons.serialize(recordMessage.getData()), emittedAt);
} else {
pairToIgnoredRecordCount.put(pair, pairToIgnoredRecordCount.getOrDefault(pair, 0L) + 1L);
}
Expand Down
Loading

0 comments on commit b18bd43

Please sign in to comment.