Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Destination Postgres: fix \u0000(NULL) value processing #5336

Merged
merged 13 commits into from
Aug 30, 2021
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"destinationDefinitionId": "25c5221d-dce2-4163-ade9-739ef790f503",
"name": "Postgres",
"dockerRepository": "airbyte/destination-postgres",
"dockerImageTag": "0.3.9",
"dockerImageTag": "0.3.10",
"documentationUrl": "https://docs.airbyte.io/integrations/destinations/postgres",
"icon": "postgresql.svg"
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
- destinationDefinitionId: 25c5221d-dce2-4163-ade9-739ef790f503
name: Postgres
dockerRepository: airbyte/destination-postgres
dockerImageTag: 0.3.9
dockerImageTag: 0.3.10
documentationUrl: https://docs.airbyte.io/integrations/destinations/postgres
icon: postgresql.svg
- destinationDefinitionId: b4c5d105-31fd-4817-96b6-cb923bfc04cb
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package io.airbyte.integrations.destination.buffered_stream_consumer;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.base.Preconditions;
import io.airbyte.commons.concurrency.VoidCallable;
import io.airbyte.commons.functional.CheckedConsumer;
Expand Down Expand Up @@ -99,7 +100,7 @@ public class BufferedStreamConsumer extends FailureTrackingAirbyteMessageConsume
private final Set<AirbyteStreamNameNamespacePair> streamNames;
private final List<AirbyteMessage> buffer;
private final ConfiguredAirbyteCatalog catalog;
private final CheckedFunction<String, Boolean, Exception> isValidRecord;
private final CheckedFunction<JsonNode, Boolean, Exception> isValidRecord;
private final Map<AirbyteStreamNameNamespacePair, Long> pairToIgnoredRecordCount;
private final Consumer<AirbyteMessage> outputRecordCollector;
private final int queueBatchSize;
Expand All @@ -115,7 +116,7 @@ public BufferedStreamConsumer(Consumer<AirbyteMessage> outputRecordCollector,
RecordWriter recordWriter,
CheckedConsumer<Boolean, Exception> onClose,
ConfiguredAirbyteCatalog catalog,
CheckedFunction<String, Boolean, Exception> isValidRecord,
CheckedFunction<JsonNode, Boolean, Exception> isValidRecord,
int queueBatchSize) {
this.outputRecordCollector = outputRecordCollector;
this.queueBatchSize = queueBatchSize;
Expand Down Expand Up @@ -151,13 +152,12 @@ protected void acceptTracked(AirbyteMessage message) throws Exception {
if (message.getType() == Type.RECORD) {
final AirbyteRecordMessage recordMessage = message.getRecord();
final AirbyteStreamNameNamespacePair stream = AirbyteStreamNameNamespacePair.fromRecordMessage(recordMessage);
final String data = Jsons.serialize(message.getRecord().getData());

if (!streamNames.contains(stream)) {
throwUnrecognizedStream(catalog, message);
}

if (!isValidRecord.apply(data)) {
if (!isValidRecord.apply(message.getRecord().getData())) {
pairToIgnoredRecordCount.put(stream, pairToIgnoredRecordCount.getOrDefault(stream, 0L) + 1L);
return;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.buffered_stream_consumer;

import io.airbyte.protocol.models.AirbyteMessage;

/**
* Strategy interface for deriving a formatted date string from an {@link AirbyteMessage}.
* <p>
* NOTE(review): which date is formatted (e.g. the record's emitted-at time) and the output pattern
* are decided by the implementation; neither is visible from this interface - confirm against the
* implementing classes.
*/
public interface StreamDateFormatter {

/**
* Builds the date string for the given message.
*
* @param airbyteMessage message the date string is derived from
* @return the formatted date as a string
*/
String getFormattedDate(AirbyteMessage airbyteMessage);

}
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import static org.mockito.Mockito.verifyNoInteractions;
import static org.mockito.Mockito.when;

import com.fasterxml.jackson.databind.JsonNode;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import io.airbyte.commons.concurrency.VoidCallable;
Expand Down Expand Up @@ -85,7 +86,7 @@ public class BufferedStreamConsumerTest {
private VoidCallable onStart;
private RecordWriter recordWriter;
private CheckedConsumer<Boolean, Exception> onClose;
private CheckedFunction<String, Boolean, Exception> isValidRecord;
private CheckedFunction<JsonNode, Boolean, Exception> isValidRecord;
private Consumer<AirbyteMessage> outputRecordCollector;

@SuppressWarnings("unchecked")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@ plugins {
id 'airbyte-integration-test-java'
}

application {
mainClass = 'io.airbyte.integrations.destination.jdbc.JdbcDestination'
}

dependencies {
implementation 'com.google.cloud:google-cloud-storage:1.113.16'
implementation 'com.google.auth:google-auth-library-oauth2-http:0.25.5'
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* MIT License
*
* Copyright (c) 2020 Airbyte
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package io.airbyte.integrations.destination.jdbc;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.util.function.Function;
import java.util.function.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DataAdapter {

private static final Logger LOGGER = LoggerFactory.getLogger(DataAdapter.class);

private final Predicate<JsonNode> filterValueNode;
private final Function<JsonNode, JsonNode> valueNodeAdapter;

/**
 * Creates an adapter that enforces destination-specific data rules on JSON value nodes. A typical
 * use is the Postgres destination, which cannot store text containing the U+0000 character: the
 * predicate selects the offending value nodes and the function produces their replacements.
 *
 * @param filterValueNode - predicate deciding whether a given value node has to be adapted
 * @param valueNodeAdapter - function producing the adapted node for every selected value node
 */
public DataAdapter(
    final Predicate<JsonNode> filterValueNode,
    final Function<JsonNode, JsonNode> valueNodeAdapter) {
  this.valueNodeAdapter = valueNodeAdapter;
  this.filterValueNode = filterValueNode;
}

/**
 * Applies the configured adaptation to every value node of the given message data, modifying the
 * tree in place. A {@code null} argument is ignored.
 *
 * @param messageData root JSON node of a record's data; may be {@code null}
 */
public void adapt(JsonNode messageData) {
  if (messageData == null) {
    return;
  }
  adaptAllValueNodes(messageData);
}

/**
 * Starts the recursive traversal at the root, which has neither a field name nor a parent.
 *
 * @param rootNode root JSON node of the tree to adapt
 */
private void adaptAllValueNodes(JsonNode rootNode) {
  final String noFieldName = null;
  final JsonNode noParent = null;
  adaptValueNodes(noFieldName, rootNode, noParent);
}

/**
* Inspects a json node. A value node that matches the filter predicate is replaced in its parent
* with the result of the value node adapter. For an array or an object, the method is run
* recursively on its elements or fields.
*
* @param fieldName Name of a json node
* @param node Json node
* @param parentNode Parent json node
*/
private void adaptValueNodes(String fieldName, JsonNode node, JsonNode parentNode) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I follow the logic of this method. When can node.isValueNode() be true? Also, I don't like the fact that fieldName can be null and we don't have a null check.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If an element contains a value, it's a value node. An element might also be an array or an object.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@DoNotPanicUA can you put a comment on this method explaining how it works, so that it's clear for anyone who is reading this code for the first time?

if (node.isValueNode() && filterValueNode.test(node)) {
if (fieldName != null) {
var adaptedNode = valueNodeAdapter.apply(node);
((ObjectNode) parentNode).set(fieldName, adaptedNode);
} else
throw new RuntimeException("Unexpected value node without fieldName. Node: " + node);
} else if (node.isArray()) {
node.elements().forEachRemaining(arrayNode -> adaptValueNodes(null, arrayNode, node));
} else {
node.fields().forEachRemaining(stringJsonNodeEntry -> adaptValueNodes(stringJsonNodeEntry.getKey(), stringJsonNodeEntry.getValue(), node));
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -29,28 +29,22 @@
import io.airbyte.db.jdbc.JdbcDatabase;
import io.airbyte.integrations.base.JavaBaseConstants;
import io.airbyte.protocol.models.AirbyteRecordMessage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.sql.SQLException;
import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import java.util.UUID;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.postgresql.copy.CopyManager;
import org.postgresql.core.BaseConnection;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DefaultSqlOperations implements SqlOperations {
public abstract class JdbcSqlOperations implements SqlOperations {

private static final Logger LOGGER = LoggerFactory.getLogger(DefaultSqlOperations.class);
private static final Logger LOGGER = LoggerFactory.getLogger(JdbcSqlOperations.class);

@Override
public void createSchemaIfNotExists(JdbcDatabase database, String schemaName) throws Exception {
Expand All @@ -77,37 +71,6 @@ public String createTableQuery(JdbcDatabase database, String schemaName, String
schemaName, tableName, JavaBaseConstants.COLUMN_NAME_AB_ID, JavaBaseConstants.COLUMN_NAME_DATA, JavaBaseConstants.COLUMN_NAME_EMITTED_AT);
}

@Override
public void insertRecords(JdbcDatabase database, List<AirbyteRecordMessage> records, String schemaName, String tmpTableName) throws SQLException {
if (records.isEmpty()) {
return;
}

// todo (cgardens) - move this into a postgres version of this. this syntax is postgres-specific
database.execute(connection -> {
File tmpFile = null;
try {
tmpFile = Files.createTempFile(tmpTableName + "-", ".tmp").toFile();
writeBatchToFile(tmpFile, records);

var copyManager = new CopyManager(connection.unwrap(BaseConnection.class));
var sql = String.format("COPY %s.%s FROM stdin DELIMITER ',' CSV", schemaName, tmpTableName);
var bufferedReader = new BufferedReader(new FileReader(tmpFile));
copyManager.copyIn(sql, bufferedReader);
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
try {
if (tmpFile != null) {
Files.delete(tmpFile.toPath());
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
}

protected void writeBatchToFile(File tmpFile, List<AirbyteRecordMessage> records) throws Exception {
PrintWriter writer = null;
try {
Expand Down Expand Up @@ -167,8 +130,28 @@ public boolean isSchemaRequired() {
}

@Override
public boolean isValidData(String data) {
public boolean isValidData(JsonNode data) {
return true;
}

/**
 * Adapts the data of every record with the adapter returned by {@link #getDataAdapter()} and then
 * delegates the actual insertion to {@link #insertRecordsInternal}. The record data is modified in
 * place before it is handed over.
 *
 * @param database database handle passed through to the insertion
 * @param records records whose data is adapted and then inserted
 * @param schemaName schema name passed through to the insertion
 * @param tableName table name passed through to the insertion
 * @throws Exception if the insertion fails
 */
@Override
public final void insertRecords(JdbcDatabase database,
    List<AirbyteRecordMessage> records,
    String schemaName,
    String tableName)
    throws Exception {
  // The adapter does not depend on the record, so resolve it once per batch instead of
  // constructing a new instance for every record.
  final DataAdapter dataAdapter = getDataAdapter();
  for (final AirbyteRecordMessage recordMessage : records) {
    dataAdapter.adapt(recordMessage.getData());
  }
  insertRecordsInternal(database, records, schemaName, tableName);
}

/**
* Destination-specific insertion of a batch of records. Called by {@code insertRecords} after the
* data of each record has been passed through the data adapter.
*
* @param database database handle received by {@code insertRecords}
* @param records records to insert, with their data already adapted
* @param schemaName schema name received by {@code insertRecords}
* @param tableName table name received by {@code insertRecords}
* @throws Exception if the records cannot be inserted
*/
protected abstract void insertRecordsInternal(JdbcDatabase database,
List<AirbyteRecordMessage> records,
String schemaName,
String tableName)
throws Exception;

/**
 * Supplies the adapter applied to record data before insertion. The adapter built here uses a
 * filter that accepts no node, so it never rewrites any value; subclasses may override this
 * method to supply destination-specific rules.
 *
 * @return an adapter that leaves all data unchanged
 */
protected DataAdapter getDataAdapter() {
  return new DataAdapter(valueNode -> false, valueNode -> valueNode);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

package io.airbyte.integrations.destination.jdbc;

import com.fasterxml.jackson.databind.JsonNode;
import io.airbyte.db.jdbc.JdbcDatabase;
import io.airbyte.protocol.models.AirbyteRecordMessage;
import java.util.List;
Expand Down Expand Up @@ -115,7 +116,7 @@ public interface SqlOperations {
/**
* Check if the data record is valid and ok to be written to destination
*/
boolean isValidData(final String data);
boolean isValidData(final JsonNode data);

/**
* Denotes whether the destination has the concept of schema or not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,11 @@ private static RecordWriter recordWriterFunction(Map<AirbyteStreamNameNamespaceP
return (AirbyteStreamNameNamespacePair pair, List<AirbyteRecordMessage> records) -> {
for (AirbyteRecordMessage recordMessage : records) {
var id = UUID.randomUUID();
var data = Jsons.serialize(recordMessage.getData());
if (sqlOperations.isValidData(data)) {
if (sqlOperations.isValidData(recordMessage.getData())) {
// TODO Truncate json data instead of throwing whole record away?
// or should we upload it into a special rejected record folder in s3 instead?
var emittedAt = Timestamp.from(Instant.ofEpochMilli(recordMessage.getEmittedAt()));
pairToCopier.get(pair).write(id, data, emittedAt);
pairToCopier.get(pair).write(id, Jsons.serialize(recordMessage.getData()), emittedAt);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a problem, we are not going to do Jsons.serialize twice for Redshift. First in the isValidData method and then here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, it's designed in a way that doesn't leave us many options. The original implementation performs an additional serialization for all destinations. Here we have one additional serialization for a single destination, and only in the Copy flow, so this is already a significant improvement.
I propose creating a new issue for the Redshift destination improvement in order to unblock the Postgres issue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool! Please create a follow-up issue to resolve this.

} else {
pairToIgnoredRecordCount.put(pair, pairToIgnoredRecordCount.getOrDefault(pair, 0L) + 1L);
}
Expand Down
Loading