source.py
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

import logging
from datetime import datetime, timedelta, timezone
from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union

import isodate
import pendulum
from airbyte_cdk.logger import AirbyteLogFormatter
from airbyte_cdk.models import (
    AirbyteMessage,
    AirbyteStateMessage,
    ConfiguredAirbyteCatalog,
    ConfiguredAirbyteStream,
    FailureType,
    Level,
    SyncMode,
)
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.source import TState
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField, FinalStateCursor
from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
from dateutil.relativedelta import relativedelta
from pendulum.parsing.exceptions import ParserError
from requests import codes, exceptions # type: ignore[import]

from .api import PARENT_SALESFORCE_OBJECTS, UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS, UNSUPPORTED_FILTERING_STREAMS, Salesforce
from .streams import (
    LOOKBACK_SECONDS,
    BulkIncrementalSalesforceStream,
    BulkSalesforceStream,
    BulkSalesforceSubStream,
    Describe,
    IncrementalRestSalesforceStream,
    RestSalesforceStream,
    RestSalesforceSubStream,
)
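
# A minimal usage sketch with hypothetical values. The connector is normally driven by the
# Airbyte platform; only the configuration keys read in this module are shown, and the
# credentials consumed by Salesforce(**config) are omitted.
#
#     config = {
#         "start_date": "2021-01-01T00:00:00Z",  # optional, streams() defaults it to 2 years back
#         "stream_slice_step": "P30D",           # optional ISO 8601 slice duration
#         "num_workers": 10,                     # optional, capped at _MAX_CONCURRENCY
#         "force_use_bulk_api": False,           # optional, keep Bulk API even if fields are dropped
#     }
#     source = SourceSalesforce(catalog=None, config=config, state=None)
#     ok, error = source.check_connection(logging.getLogger("airbyte"), config)
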
_DEFAULT_CONCURRENCY = 10
_MAX_CONCURRENCY = 10
logger = logging.getLogger("airbyte")
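

# Raised from _read_stream() when Salesforce reports REQUEST_LIMIT_EXCEEDED, so the sync can
# stop gracefully and still be treated as successful (see read() and _read_stream() below).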
class AirbyteStopSync(AirbyteTracedException):
    pass


class SourceSalesforce(ConcurrentSourceAdapter):
    DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
    START_DATE_OFFSET_IN_YEARS = 2
    MAX_WORKERS = 5
    stop_sync_on_stream_failure = True
    message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[logger.level]))
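
    # The concurrency level is the user-supplied "num_workers" capped at _MAX_CONCURRENCY;
    # it is handed to ConcurrentSource.create() together with the shared message repository.
    # JobTracker(limit=5) bounds how many asynchronous Bulk API jobs may run at the same time.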
    def __init__(self, catalog: Optional[ConfiguredAirbyteCatalog], config: Optional[Mapping[str, Any]], state: Optional[TState], **kwargs):
        if config:
            concurrency_level = min(config.get("num_workers", _DEFAULT_CONCURRENCY), _MAX_CONCURRENCY)
        else:
            concurrency_level = _DEFAULT_CONCURRENCY
        logger.info(f"Using concurrent cdk with concurrency level {concurrency_level}")
        concurrent_source = ConcurrentSource.create(
            concurrency_level, concurrency_level // 2, logger, self._slice_logger, self.message_repository
        )
        super().__init__(concurrent_source)
        self.catalog = catalog
        self.state = state
        self._job_tracker = JobTracker(limit=5)

    @staticmethod
    def _get_sf_object(config: Mapping[str, Any]) -> Salesforce:
        sf = Salesforce(**config)
        sf.login()
        return sf
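
    # "stream_slice_step" must be an ISO 8601 duration of at least one second, e.g. "PT1H"
    # or "P30D"; anything else is raised as a config error.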
    @staticmethod
    def _validate_stream_slice_step(stream_slice_step: str):
        if stream_slice_step:
            try:
                duration = pendulum.parse(stream_slice_step)
                if not isinstance(duration, pendulum.Duration):
                    message = "Stream slice step Interval should be provided in ISO 8601 format."
                elif duration < pendulum.Duration(seconds=1):
                    message = "Stream slice step Interval is too small. It should be no less than 1 second. Please set a higher value and try again."
                else:
                    return
                raise ParserError(message)
            except ParserError as e:
                internal_message = "Incorrect stream slice step"
                raise AirbyteTracedException(failure_type=FailureType.config_error, internal_message=internal_message, message=e.args[0])

    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[str]]:
        self._validate_stream_slice_step(config.get("stream_slice_step"))
        salesforce = self._get_sf_object(config)
        salesforce.describe()
        return True, None
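
    # API selection: streams in UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS always use REST;
    # with "force_use_bulk_api" the Bulk API is kept even though base64/compound fields are
    # excluded; otherwise the presence of such fields forces REST, and Bulk is the default.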
    @classmethod
    def _get_api_type(cls, stream_name: str, json_schema: Mapping[str, Any], force_use_bulk_api: bool) -> str:
        """Get proper API type: rest or bulk"""
        # Salesforce BULK API currently does not support loading fields with data type base64 and compound data
        properties = json_schema.get("properties", {})
        properties_not_supported_by_bulk = {
            key: value for key, value in properties.items() if value.get("format") == "base64" or "object" in value["type"]
        }
        rest_only = stream_name in UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS
        if rest_only:
            logger.warning(f"BULK API is not supported for stream: {stream_name}")
            return "rest"
        if force_use_bulk_api and properties_not_supported_by_bulk:
            logger.warning(
                f"Following properties will be excluded from stream: {stream_name} due to BULK API limitations: {list(properties_not_supported_by_bulk)}"
            )
            return "bulk"
        if properties_not_supported_by_bulk:
            return "rest"
        return "bulk"

    @classmethod
    def _get_stream_type(cls, stream_name: str, api_type: str):
        """Get proper stream class: full_refresh, incremental or substream
        SubStreams (like ContentDocumentLink) do not support incremental sync because of query restrictions, look here:
        https://developer.salesforce.com/docs/atlas.en-us.object_reference.meta/object_reference/sforce_api_objects_contentdocumentlink.htm
        """
        parent_name = PARENT_SALESFORCE_OBJECTS.get(stream_name, {}).get("parent_name")
        if api_type == "rest":
            full_refresh = RestSalesforceSubStream if parent_name else RestSalesforceStream
            incremental = IncrementalRestSalesforceStream
        elif api_type == "bulk":
            full_refresh = BulkSalesforceSubStream if parent_name else BulkSalesforceStream
            incremental = BulkIncrementalSalesforceStream
        else:
            raise Exception(f"Stream {stream_name} cannot be processed by REST or BULK API.")
        return full_refresh, incremental
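
    # A stream becomes incremental only when it exposes a replication key and is not listed
    # in UNSUPPORTED_FILTERING_STREAMS; incremental streams are sliced by "stream_slice_step"
    # (default "P30D").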
    def prepare_stream(self, stream_name: str, json_schema, sobject_options, sf_object, authenticator, config):
        """Choose proper stream class: sync mode (full_refresh/incremental), API type (Rest/Bulk), SubStream"""
        pk, replication_key = sf_object.get_pk_and_replication_key(json_schema)
        stream_kwargs = {
            "stream_name": stream_name,
            "schema": json_schema,
            "pk": pk,
            "sobject_options": sobject_options,
            "sf_api": sf_object,
            "authenticator": authenticator,
            "start_date": config.get("start_date"),
            "job_tracker": self._job_tracker,
            "message_repository": self.message_repository,
        }

        api_type = self._get_api_type(stream_name, json_schema, config.get("force_use_bulk_api", False))
        full_refresh, incremental = self._get_stream_type(stream_name, api_type)
        if replication_key and stream_name not in UNSUPPORTED_FILTERING_STREAMS:
            stream_class = incremental
            stream_kwargs["replication_key"] = replication_key
            stream_kwargs["stream_slice_step"] = config.get("stream_slice_step", "P30D")
        else:
            stream_class = full_refresh

        return stream_class, stream_kwargs
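
    # Substreams get their parent stream instantiated from the minimal schema recorded in
    # PARENT_SALESFORCE_OBJECTS, REST streams without a primary key and with too many
    # properties are skipped, and the Describe stream is always appended at the end.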
    def generate_streams(
        self,
        config: Mapping[str, Any],
        stream_objects: Mapping[str, Any],
        sf_object: Salesforce,
    ) -> List[Stream]:
        """Generates a list of streams by their names. It can be used for different tests too."""
        authenticator = TokenAuthenticator(sf_object.access_token)
        schemas = sf_object.generate_schemas(stream_objects)
        default_args = [sf_object, authenticator, config]
        streams = []
        state_manager = ConnectorStateManager(state=self.state)
        for stream_name, sobject_options in stream_objects.items():
            json_schema = schemas.get(stream_name, {})

            stream_class, kwargs = self.prepare_stream(stream_name, json_schema, sobject_options, *default_args)

            parent_name = PARENT_SALESFORCE_OBJECTS.get(stream_name, {}).get("parent_name")
            if parent_name:
                # get minimal schema required for getting proper class name full_refresh/incremental, rest/bulk
                parent_schema = PARENT_SALESFORCE_OBJECTS.get(stream_name, {}).get("schema_minimal")
                parent_class, parent_kwargs = self.prepare_stream(parent_name, parent_schema, sobject_options, *default_args)
                kwargs["parent"] = parent_class(**parent_kwargs)

            stream = stream_class(**kwargs)

            api_type = self._get_api_type(stream_name, json_schema, config.get("force_use_bulk_api", False))
            if api_type == "rest" and not stream.primary_key and stream.too_many_properties:
                logger.warning(
                    f"Can not instantiate stream {stream_name}. It is not supported by the BULK API and can not be "
                    "implemented via REST because the number of its properties exceeds the limit and it lacks a primary key."
                )
                continue

            streams.append(self._wrap_for_concurrency(config, stream, state_manager))

        streams.append(self._wrap_for_concurrency(config, Describe(sf_api=sf_object, catalog=self.catalog), state_manager))
        return streams
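
    # Streams without a cursor field, or streams configured for full_refresh in the catalog,
    # are wrapped with a FinalStateCursor and no initial state; the rest keep the
    # ConcurrentCursor that now owns slice generation (see _create_stream_slicer_cursor).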
    def _wrap_for_concurrency(self, config, stream, state_manager):
        stream_slicer_cursor = None
        if stream.cursor_field:
            stream_slicer_cursor = self._create_stream_slicer_cursor(config, state_manager, stream)
            if hasattr(stream, "set_cursor"):
                stream.set_cursor(stream_slicer_cursor)
            if hasattr(stream, "parent") and hasattr(stream.parent, "set_cursor"):
                stream_slicer_cursor = self._create_stream_slicer_cursor(config, state_manager, stream)
                stream.parent.set_cursor(stream_slicer_cursor)

        if not stream_slicer_cursor or self._get_sync_mode_from_catalog(stream) == SyncMode.full_refresh:
            cursor = FinalStateCursor(
                stream_name=stream.name, stream_namespace=stream.namespace, message_repository=self.message_repository
            )
            state = None
        else:
            cursor = stream_slicer_cursor
            state = cursor.state

        return StreamFacade.create_from_stream(stream, self, logger, state, cursor)
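
    # "start_date" defaults to START_DATE_OFFSET_IN_YEARS (2) years before now when the
    # config does not provide one.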
    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        if not config.get("start_date"):
            config["start_date"] = (datetime.now() - relativedelta(years=self.START_DATE_OFFSET_IN_YEARS)).strftime(self.DATETIME_FORMAT)
        sf = self._get_sf_object(config)
        stream_objects = sf.get_validated_streams(config=config, catalog=self.catalog)
        streams = self.generate_streams(config, stream_objects, sf)
        return streams

    def _create_stream_slicer_cursor(
        self, config: Mapping[str, Any], state_manager: ConnectorStateManager, stream: Stream
    ) -> ConcurrentCursor:
        """
        We have moved the generation of stream slices to the concurrent CDK cursor
        """
        cursor_field_key = stream.cursor_field or ""
        if not isinstance(cursor_field_key, str):
            raise AssertionError(f"Nested cursor fields are not supported; a str is expected but got {cursor_field_key}.")
        cursor_field = CursorField(cursor_field_key)
        stream_state = state_manager.get_stream_state(stream.name, stream.namespace)
        return ConcurrentCursor(
            stream.name,
            stream.namespace,
            stream_state,
            self.message_repository,
            state_manager,
            stream.state_converter,
            cursor_field,
            self._get_slice_boundary_fields(stream, state_manager),
            datetime.fromtimestamp(pendulum.parse(config["start_date"]).timestamp(), timezone.utc),
            stream.state_converter.get_end_provider(),
            timedelta(seconds=LOOKBACK_SECONDS),
            isodate.parse_duration(config["stream_slice_step"]) if "stream_slice_step" in config else timedelta(days=30),
        )
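
    # Every stream uses the same slice boundary keys, "start_date" and "end_date".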
    def _get_slice_boundary_fields(self, stream: Stream, state_manager: ConnectorStateManager) -> Optional[Tuple[str, str]]:
        return ("start_date", "end_date")

    def _get_sync_mode_from_catalog(self, stream: Stream) -> Optional[SyncMode]:
        if self.catalog:
            for catalog_stream in self.catalog.streams:
                if stream.name == catalog_stream.stream.name:
                    return catalog_stream.sync_mode
        return None
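
    # The configured catalog is saved on the instance so streams() can validate the requested
    # streams; AirbyteStopSync raised further down ends the read as a successful sync.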
    def read(
        self,
        logger: logging.Logger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]] = None,
    ) -> Iterator[AirbyteMessage]:
        # save for use inside streams method
        self.catalog = catalog
        try:
            yield from super().read(logger, config, catalog, state)
        except AirbyteStopSync:
            logger.info(f"Finished syncing {self.name}")
    def _read_stream(
        self,
        logger: logging.Logger,
        stream_instance: Stream,
        configured_stream: ConfiguredAirbyteStream,
        state_manager: ConnectorStateManager,
        internal_config: InternalConfig,
    ) -> Iterator[AirbyteMessage]:
        try:
            yield from super()._read_stream(logger, stream_instance, configured_stream, state_manager, internal_config)
        except exceptions.HTTPError as error:
            error_data = error.response.json()[0]
            error_code = error_data.get("errorCode")
            url = error.response.url
            if error.response.status_code == codes.FORBIDDEN and error_code == "REQUEST_LIMIT_EXCEEDED":
                logger.warning(f"API Call {url} limit is exceeded. Error message: '{error_data.get('message')}'")
                raise AirbyteStopSync()  # if got 403 rate limit response, finish the sync with success.
            raise error