diff --git a/.bumpversion.cfg b/.bumpversion.cfg index f5eef564aef7b..473604b9ad951 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.35.12-alpha +current_version = 0.40.0-alpha commit = False tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-[a-z]+)? @@ -12,20 +12,30 @@ serialize = [bumpversion:file:airbyte-bootloader/Dockerfile] -[bumpversion:file:airbyte-scheduler/app/Dockerfile] +[bumpversion:file:airbyte-container-orchestrator/Dockerfile] -[bumpversion:file:airbyte-server/Dockerfile] +[bumpversion:file:airbyte-metrics/reporter/Dockerfile] -[bumpversion:file:airbyte-workers/Dockerfile] - -[bumpversion:file:airbyte-container-orchestrator/Dockerfile] +[bumpversion:file:airbyte-server/Dockerfile] [bumpversion:file:airbyte-webapp/package.json] [bumpversion:file:airbyte-webapp/package-lock.json] +[bumpversion:file:airbyte-workers/Dockerfile] + [bumpversion:file:charts/airbyte/Chart.yaml] +[bumpversion:file:charts/airbyte-worker/Chart.yaml] + +[bumpversion:file:charts/airbyte-temporal/Chart.yaml] + +[bumpversion:file:charts/airbyte-webapp/Chart.yaml] + +[bumpversion:file:charts/airbyte-server/Chart.yaml] + +[bumpversion:file:charts/airbyte-bootloader/Chart.yaml] + [bumpversion:file:charts/airbyte/values.yaml] [bumpversion:file:charts/airbyte/README.md] @@ -39,3 +49,13 @@ serialize = [bumpversion:file:kube/overlays/stable-with-resource-limits/.env] [bumpversion:file:kube/overlays/stable-with-resource-limits/kustomization.yaml] + +[bumpversion:file:octavia-cli/install.sh] + +[bumpversion:file:octavia-cli/README.md] + +[bumpversion:file:octavia-cli/Dockerfile] + +[bumpversion:file:octavia-cli/setup.py] +serialize = + {major}.{minor}.{patch} diff --git a/.env b/.env index 3e351401939e9..a8f8f95d64828 100644 --- a/.env +++ b/.env @@ -10,7 +10,7 @@ ### SHARED ### -VERSION=0.35.12-alpha +VERSION=0.40.0-alpha # When using the airbyte-db via default docker image CONFIG_ROOT=/data @@ -40,7 +40,7 @@ DATABASE_PASSWORD=docker DATABASE_HOST=db DATABASE_PORT=5432 DATABASE_DB=airbyte -# translate manually DATABASE_URL=jdbc:postgresql://${DATABASE_HOST}:${DATABASE_PORT/${DATABASE_DB} (do not include the username or password here) +# translate manually DATABASE_URL=jdbc:postgresql://${DATABASE_HOST}:${DATABASE_PORT}/${DATABASE_DB} (do not include the username or password here) DATABASE_URL=jdbc:postgresql://db:5432/airbyte JOBS_DATABASE_MINIMUM_FLYWAY_MIGRATION_VERSION=0.29.15.001 @@ -48,7 +48,7 @@ JOBS_DATABASE_MINIMUM_FLYWAY_MIGRATION_VERSION=0.29.15.001 CONFIG_DATABASE_USER= CONFIG_DATABASE_PASSWORD= CONFIG_DATABASE_URL= -CONFIGS_DATABASE_MINIMUM_FLYWAY_MIGRATION_VERSION=0.35.1.001 +CONFIGS_DATABASE_MINIMUM_FLYWAY_MIGRATION_VERSION=0.35.15.001 ### AIRBYTE SERVICES ### TEMPORAL_HOST=airbyte-temporal:7233 @@ -70,24 +70,34 @@ JOB_MAIN_CONTAINER_MEMORY_LIMIT= ### LOGGING/MONITORING/TRACKING ### TRACKING_STRATEGY=segment +JOB_ERROR_REPORTING_STRATEGY=logging # Although not present as an env var, expected by Log4J configuration. LOG_LEVEL=INFO -# Although not present as an env var, helps Airbyte track job healthiness. -SENTRY_DSN="https://d4b03de0c4574c78999b8d58e55243dc@o1009025.ingest.sentry.io/6102835" ### APPLICATIONS ### -# Scheduler # -# Relevant to scaling. -SUBMITTER_NUM_THREADS=10 - # Worker # # Relevant to scaling. 
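The corrected comment in the `.env` hunk above describes how `DATABASE_URL` is assembled from the individual `DATABASE_*` values; as a minimal illustration using the defaults shown in this file:

```bash
# Sketch only: compose the JDBC URL from the parts defined in .env.
# Credentials are deliberately not embedded in the URL; Airbyte reads
# DATABASE_USER / DATABASE_PASSWORD separately.
DATABASE_HOST=db
DATABASE_PORT=5432
DATABASE_DB=airbyte
DATABASE_URL="jdbc:postgresql://${DATABASE_HOST}:${DATABASE_PORT}/${DATABASE_DB}"
echo "${DATABASE_URL}"   # jdbc:postgresql://db:5432/airbyte
```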
MAX_SYNC_WORKERS=5 MAX_SPEC_WORKERS=5 MAX_CHECK_WORKERS=5 MAX_DISCOVER_WORKERS=5 +# Temporal Activity configuration +ACTIVITY_MAX_ATTEMPT= +ACTIVITY_INITIAL_DELAY_BETWEEN_ATTEMPTS_SECONDS= +ACTIVITY_MAX_DELAY_BETWEEN_ATTEMPTS_SECONDS= +WORKFLOW_FAILURE_RESTART_DELAY_SECONDS= ### FEATURE FLAGS ### -NEW_SCHEDULER=false +AUTO_DISABLE_FAILING_CONNECTIONS=false +EXPOSE_SECRETS_IN_EXPORT=false +FORCE_MIGRATE_SECRET_STORE=false + +### MONITORING FLAGS ### +# Accepted values are datadog and otel (open telemetry) +METRIC_CLIENT= +# Useful only when metric client is set to be otel. Must start with http:// or https://. +OTEL_COLLECTOR_ENDPOINT="http://host.docker.internal:4317" + +USE_STREAM_CAPABLE_STATE=true diff --git a/.env.dev b/.env.dev index e7a4f02b7d5b0..2a6dc7eb8129b 100644 --- a/.env.dev +++ b/.env.dev @@ -27,4 +27,3 @@ SYNC_JOB_MAX_TIMEOUT_DAYS=3 # Sentry SENTRY_DSN="" - diff --git a/.gitbook.yaml b/.gitbook.yaml deleted file mode 100644 index 82a85043a8317..0000000000000 --- a/.gitbook.yaml +++ /dev/null @@ -1,75 +0,0 @@ -root: ./docs/ - -structure: - readme: ../README.md - summary: SUMMARY.md - -redirects: - architecture/cdc: ./understanding-airbyte/cdc.md - architecture/catalog: ./understanding-airbyte/catalog.md - architecture/airbyte-specification: ./understanding-airbyte/airbyte-specification.md - architecture/basic-normalization: ./understanding-airbyte/basic-normalization.md - architecture/connections: ./understanding-airbyte/connections.md - architecture/connections/full-refresh-overwrite: ./understanding-airbyte/connections/full-refresh-overwrite.md - architecture/connections/full-refresh-append: ./understanding-airbyte/connections/full-refresh-append.md - architecture/connections/incremental-append: ./understanding-airbyte/connections/incremental-append.md - architecture/https://docs.airbyte.io/understanding-airbyte/connections/incremental-deduped-history: ./understanding-airbyte/https://docs.airbyte.io/understanding-airbyte/connections/incremental-deduped-history.md - architecture/high-level-view: ./understanding-airbyte/high-level-view.md - architecture/jobs: ./understanding-airbyte/jobs.md - architecture/tech-stack: ./understanding-airbyte/tech-stack.md - architecture/namespaces: ./understanding-airbyte/namespaces.md - architecture: ./understanding-airbyte.md - roadmap: ./project-overview/roadmap.md - changelog: ./project-overview/changelog.md - changelog/platform: ./project-overview/changelog/platform.md - changelog/connectors: ./project-overview/changelog/connectors.md - license: ./project-overview/licenses/README.md - tutorials/postgres-replication: ./examples/postgres-replication.md - tutorials/build-a-slack-activity-dashboard: ./examples/build-a-slack-activity-dashboard.md - tutorials/zoom-activity-dashboard: ./examples/zoom-activity-dashboard.md - tutorials/slack-history: ./examples/slack-history.md - tutorials/beginners-guide-to-catalog: ./tutorials/tutorials/beginners-guide-to-catalog.md - tutorials/toy-connector: ./tutorials/tutorials/build-a-connector-the-hard-way.md - tutorials/build-a-connector-the-hard-way: ./tutorials/tutorials/build-a-connector-the-hard-way.md - tutorials/adding-incremental-sync: ./tutorials/tutorials/adding-incremental-sync.md - tutorials/building-a-python-source: ./tutorials/tutorials/building-a-python-source.md - tutorials/transformations-with-sql: ./tutorials/transformation-and-normalization/transformations-with-sql.md - tutorials/transformations-with-dbt: ./tutorials/transformation-and-normalization/transformations-with-dbt.md - 
contributing-to-airbyte/cdk-tutorial-alpha: ./contributing-to-airbyte/python/README.md - integrations/connector-health: ./integrations.md - tutorials: ./operator-guides.md - tutorials/browsing-output-logs.md: ./operator-guides/browsing-output-logs.md - tutorials/locating-files-local-destination.md: ./operator-guides/locating-files-local-destination.md - tutorials/using-the-airflow-airbyte-operator.md: ./operator-guides/using-the-airflow-airbyte-operator.md - tutorials/transformation-and-normalization: ./operator-guides/transformation-and-normalization.md - tutorials/transformation-and-normalization/transformations-with-sql: ./operator-guides/transformation-and-normalization/transformations-with-sql.md - tutorials/transformation-and-normalization/transformations-with-dbt: ./operator-guides/transformation-and-normalization/transformations-with-dbt.md - tutorials/tutorials: ./contributing-to-airbyte/building-new-connector/tutorials.md - tutorials/tutorials/beginners-guide-to-catalog: ./contributing-to-airbyte/building-new-connector/tutorials/beginners-guide-to-catalog.md - tutorials/tutorials/build-a-connector-the-hard-way: ./contributing-to-airbyte/building-new-connector/tutorials/build-a-connector-the-hard-way.md - tutorials/tutorials/adding-incremental-sync: ./contributing-to-airbyte/building-new-connector/tutorials/adding-incremental-sync.md - tutorials/tutorials/building-a-python-source: ./contributing-to-airbyte/building-new-connector/tutorials/building-a-python-source.md - upgrading-airbyte: ./operator-guides/upgrading-airbyte.md - tutorials/upgrading-airbyte: ./operator-guides/upgrading-airbyte.md - contributing-to-airbyte/python: ./connector-development/cdk-python.md - contributing-to-airbyte/python/concepts/basic-concepts: ./connector-development/cdk-python/basic-concepts.md - contributing-to-airbyte/python/concepts/schemas: ./connector-development/cdk-python/schemas.md - contributing-to-airbyte/python/concepts/full-refresh-stream: ./connector-development/cdk-python/full-refresh-stream.md - contributing-to-airbyte/python/concepts/incremental-stream: ./connector-development/cdk-python/incremental-stream.md - contributing-to-airbyte/python/concepts/http-streams: ./connector-development/cdk-python/http-streams.md - contributing-to-airbyte/python/concepts/python-concepts: ./connector-development/cdk-python/python-concepts.md - contributing-to-airbyte/python/concepts/stream_slices: ./connector-development/cdk-python/stream-slices.md - contributing-to-airbyte/python/tutorials: ./connector-development/tutorials.md - contributing-to-airbyte/python/tutorials/cdk-speedrun: ./connector-development/tutorials/cdk-speedrun.md - contributing-to-airbyte/python/tutorials/cdk-tutorial-python-http: ./connector-development/tutorials/cdk-tutorial-python-http.md - contributing-to-airbyte/building-new-connector: ./connector-development.md - contributing-to-airbyte/building-new-connector/best-practices: ./connector-development.md/best-practices.md - contributing-to-airbyte/building-new-connector/monorepo-python-development: ./contributing-to-airbyte/monorepo-python-development.md - contributing-to-airbyte/building-new-connector/testing-connectors: ./connector-development/testing-connectors.md - contributing-to-airbyte/building-new-connector/source-acceptance-tests: ./connector-development/testing-connectors/source-acceptance-tests-reference.md - contributing-to-airbyte/building-new-connector/tutorials: ./connector-development/tutorials.md - 
contributing-to-airbyte/building-new-connector/tutorials/beginners-guide-to-catalog: ./understanding-airbyte/beginners-guide-to-catalog.md - contributing-to-airbyte/building-new-connector/tutorials/building-a-python-source: ./connector-development/tutorials/building-a-python-source.md - contributing-to-airbyte/building-new-connector/tutorials/building-a-python-destination: ./connector-development/tutorials/building-a-python-destination.md - contributing-to-airbyte/building-new-connector/tutorials/building-a-java-destination: ./connector-development/tutorials/building-a-java-destination.md - project-overview/code-of-conduct: ./project-overview/slack-code-of-conduct diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000000..8a84703badef3 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,46 @@ +# Frontend code +/airbyte-webapp-e2e-tests/ @airbytehq/frontend +/airbyte-webapp/ @airbytehq/frontend +## Exclude the package(-lock).json from code ownership to prevent version bump PRs from triggering codeowners review +/airbyte-webapp/package.json +/airbyte-webapp/package-lock.json + +# CDK and SAT +/airbyte-cdk/ @airbytehq/api-connectors-dx +/airbyte-integrations/bases/source-acceptance-tests/ @airbytehq/api-connectors-dx +/airbyte-integrations/connector-templates/ @airbytehq/api-connectors-dx + +# Protocol related items +/airbyte-protocol/ @airbytehq/protocol-reviewers +/docs/understanding-airbyte/airbyte-protocol.md @airbytehq/protocol-reviewers + +# Normalization +/airbyte-integrations/bases/base-normalization/ @airbytehq/normalization + +# JDBC-based connectors +/airbyte-integrations/bases/base-java/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-jdbc/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-bigquery/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-clickhouse/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-cockroachdb/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-db2/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-mssql/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-mysql/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-oracle/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-postgres/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-redshift/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-snowflake/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-tidb/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-jdbc/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-azure-blob-storage/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-clickhouse/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-databricks/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-gcs/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-mariadb-columnstore/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-mysql/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-mssql/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-oracle/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-postgres/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/destination-redshift/ @airbytehq/jdbc-connectors 
+/airbyte-integrations/connectors/destination-rockset/ @airbytehq/jdbc-connectors +/airbyte-integrations/connectors/source-snowflake/ @airbytehq/jdbc-connectors \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md index 7f9e9cd9ce825..31f6106dfc6b5 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.md +++ b/.github/ISSUE_TEMPLATE/bug-report.md @@ -8,6 +8,7 @@ assignees: '' --- - **Destination Connector and version**: (if applicable example Postgres 0.3.3) -- **Severity**: Very Low / Low / Medium / High / Critical - **Step where error happened**: Deploy / Sync job / Setup new connection / Update connector / Upgrade Airbyte ## Current Behavior @@ -38,22 +38,14 @@ You can remove the examples bellow and fill out with your information. *Tell us what should happen.* ## Logs -*If applicable, please upload the logs from the failing operation. -For sync jobs, you can download the full logs from the UI by going to the sync attempt page and -clicking the download logs button at the top right of the logs display window.* - -
-LOG - -``` + ## Steps to Reproduce 1. diff --git a/.github/actions/build-and-push-branch/action.yml b/.github/actions/build-and-push-branch/action.yml new file mode 100644 index 0000000000000..ace8e43b3adf0 --- /dev/null +++ b/.github/actions/build-and-push-branch/action.yml @@ -0,0 +1,33 @@ +name: "Build OSS Branch and Push Minimum Required OSS Images" +description: "Build jars and docker images tagged for a particular branch. Primarily used for running OSS branch code in Cloud." +inputs: + branch_version_tag: + description: 'Used to tag jars and docker images with a branch-specific version (should use the form "dev-" to pass AirbyteVersion validation)' + required: false + dockerhub_username: + description: "Used to log in to dockerhub for pushing images" + required: true + dockerhub_token: + description: "Used to log in to dockerhub for pushing images" + required: true +runs: + using: "composite" + steps: + - name: Build + id: build + uses: ./.github/actions/build-branch + with: + branch_version_tag: ${{ inputs.branch_version_tag }} + + - name: Login to Docker (on Master) + uses: docker/login-action@v1 + with: + username: ${{ inputs.dockerhub_username }} + password: ${{ inputs.dockerhub_token }} + + - name: Push Docker Images + run: | + GIT_REVISION=$(git rev-parse HEAD) + [ [ -z "$GIT_REVISION" ] ] && echo "Couldn't get the git revision..." && exit 1 + VERSION=${{ steps.build.outputs.branch_version_tag }} GIT_REVISION=$GIT_REVISION docker-compose -f docker-compose-cloud.build.yaml push + shell: bash diff --git a/.github/actions/build-branch/action.yml b/.github/actions/build-branch/action.yml new file mode 100644 index 0000000000000..fff73d4b55c41 --- /dev/null +++ b/.github/actions/build-branch/action.yml @@ -0,0 +1,52 @@ +name: "Build OSS Branch" +description: "Build jars and docker images tagged for a particular branch. Primarily used for running OSS branch code in Cloud." +inputs: + branch_version_tag: + description: 'Used to tag jars and docker images with a branch-specific version (should use the form "dev-" to pass AirbyteVersion validation)' + required: false +outputs: + branch_version_tag: + description: 'Tag used for jars and docker images. 
Either user specified or auto generated as `dev-`' + value: ${{ steps.parse-input.outputs.branch_version_tag }} +runs: + using: "composite" + steps: + - name: "Parse Input" + id: parse-input + shell: bash + run: |- + # if the *branch_version_tag* input param is not specified, then generate it as 'dev-` + # + [[ "${{ inputs.branch_version_tag }}" != '' ]] && echo "::set-output name=branch_version_tag::${{ inputs.branch_version_tag }}" \ + || { short_hash=$(git rev-parse --short HEAD); echo "::set-output name=branch_version_tag::dev-$short_hash"; } + + - uses: actions/setup-java@v1 + with: + java-version: "17" + + - uses: actions/setup-node@v2 + with: + node-version: "lts/gallium" + + - name: Set up CI Gradle Properties + run: | + mkdir -p ~/.gradle/ + cat > ~/.gradle/gradle.properties < - -Dsonar.projectKey=${{ steps.create-sq-project.outputs.sq_project_name }} - -Dsonar.verbose=true - -Dsonar.working.directory=/tmp/scannerwork - -Dsonar.language=${{ inputs.module-lang }} - -Dsonar.sourceEncoding=UTF-8 - -Dsonar.projectBaseDir=${{ steps.create-sq-project.outputs.module_dir }} - -Dsonar.exclusions=reports/**,*.toml - -Dsonar.externalIssuesReportPaths=${{ steps.sq-options.outputs.external_reports }} - ${{ steps.sq-options.outputs.options }} + projectBaseDir: ${{ steps.create-sq-project.outputs.module_dir }} + args: > + -Dsonar.projectKey=${{ steps.create-sq-project.outputs.sq_project_name }} + -Dsonar.verbose=true + -Dsonar.working.directory=/tmp/scannerwork + -Dsonar.language=${{ inputs.module-lang }} + -Dsonar.sourceEncoding=UTF-8 + -Dsonar.projectBaseDir=${{ steps.create-sq-project.outputs.module_dir }} + -Dsonar.exclusions=reports/**,*.toml,*_tests/**,setup.py,main.py + -Dsonar.externalIssuesReportPaths=${{ steps.sq-options.outputs.external_reports }} + ${{ steps.sq-options.outputs.options }} - name: Generate SonarQube Report shell: bash diff --git a/.github/actions/start-aws-runner/action.yml b/.github/actions/start-aws-runner/action.yml index 635bb7f5cbc01..c3b94df610b6a 100644 --- a/.github/actions/start-aws-runner/action.yml +++ b/.github/actions/start-aws-runner/action.yml @@ -8,8 +8,8 @@ inputs: github-token: required: true ec2-image-id: - # github-self-hosted-runner-ubuntu-20-100g-disk - default: "ami-0ccd67e0abd945eec" + # github-self-hosted-runner-ubuntu-20-100g-disk-with-cypress-deps + default: "ami-0f23be2f917510c26" required: true ec2-instance-type: default: "c5.2xlarge" @@ -41,7 +41,7 @@ runs: aws-region: us-east-2 - name: Start EC2 runner id: start-ec2-runner - uses: machulav/ec2-github-runner@v2.3.2 + uses: airbytehq/ec2-github-runner@base64v1.1.0 with: mode: start github-token: ${{ inputs.github-token }} @@ -49,6 +49,9 @@ runs: ec2-instance-type: ${{ inputs.ec2-instance-type }} subnet-id: ${{ inputs.subnet-id }} security-group-id: ${{ inputs.security-group-id }} + # this adds a label to group any EC2 runners spun up within the same action run + # this enables creating a pool of runners to run multiple/matrix jobs on in parallel + label: runner-pool-${{ github.run_id }} aws-resource-tags: > [ {"Key": "BuildType", "Value": "oss"}, diff --git a/.github/labeler.yml b/.github/labeler.yml index 4b9b62c8f0308..efdbb9f069f87 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -17,7 +17,7 @@ area/platform: - charts/* - charts/**/* -area/frontend: +team/frontend: - airbyte-webapp/* - airbyte-webapp/**/* diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 4ed18fdfc2749..dec493beec5c1 100644 --- a/.github/pull_request_template.md +++ 
b/.github/pull_request_template.md @@ -13,79 +13,90 @@ Are there any breaking changes? What is the end result perceived by the user? If yes, please merge this PR with the 🚨🚨 emoji so changelog authors can further highlight this if needed. ## Pre-merge Checklist -Expand the relevant checklist and delete the others. +Expand the relevant checklist and delete the others. -
New Connector -

+

New Connector + +### Community member or Airbyter -#### Community member or Airbyter - - [ ] **Community member?** Grant edit access to maintainers ([instructions](https://docs.github.com/en/github/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork#enabling-repository-maintainer-permissions-on-existing-pull-requests)) -- [ ] Secrets in the connector's spec are annotated with `airbyte_secret` +- [ ] Secrets in the connector's spec are annotated with `airbyte_secret` - [ ] Unit & integration tests added and passing. Community members, please provide proof of success locally e.g: screenshot or copy-paste unit, integration, and acceptance test output. To run acceptance tests for a Python connector, follow instructions in the README. For java connectors run `./gradlew :airbyte-integrations:connectors::integrationTest`. - [ ] Code reviews completed -- [ ] Documentation updated +- [ ] Documentation updated - [ ] Connector's `README.md` - [ ] Connector's `bootstrap.md`. See [description and examples](https://docs.google.com/document/d/1ypdgmwmEHWv-TrO4_YOQ7pAJGVrMp5BOkEVh831N260/edit?usp=sharing) - - [ ] `docs/SUMMARY.md` - [ ] `docs/integrations//.md` including changelog. See changelog [example](https://docs.airbyte.io/integrations/sources/stripe#changelog) - [ ] `docs/integrations/README.md` - [ ] `airbyte-integrations/builds.md` - [ ] PR name follows [PR naming conventions](https://docs.airbyte.io/contributing-to-airbyte/updating-documentation#issues-and-pull-requests) - -#### Airbyter -If this is a community PR, the Airbyte engineer reviewing this PR is responsible for the below items. - +### Airbyter + +If this is a community PR, the Airbyte engineer reviewing this PR is responsible for the below items. + - [ ] Create a non-forked branch based on this PR and test the below items on it - [ ] Build is successful -- [ ] Credentials added to Github CI. [Instructions](https://docs.airbyte.io/connector-development#using-credentials-in-ci). -- [ ] [`/test connector=connectors/` command](https://docs.airbyte.io/connector-development#updating-an-existing-connector) is passing. +- [ ] If new credentials are required for use in CI, add them to GSM. [Instructions](https://docs.airbyte.io/connector-development#using-credentials-in-ci). +- [ ] [`/test connector=connectors/` command](https://docs.airbyte.io/connector-development#updating-an-existing-connector) is passing - [ ] New Connector version released on Dockerhub by running the `/publish` command described [here](https://docs.airbyte.io/connector-development#updating-an-existing-connector) - [ ] After the connector is published, connector added to connector index as described [here](https://docs.airbyte.io/connector-development#publishing-a-connector) - [ ] Seed specs have been re-generated by building the platform and committing the changes to the seed spec files, as described [here](https://docs.airbyte.io/connector-development#publishing-a-connector) - -

+
+
Updating a connector + +### Community member or Airbyter -
Updating a connector -

- -#### Community member or Airbyter - - [ ] Grant edit access to maintainers ([instructions](https://docs.github.com/en/github/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork#enabling-repository-maintainer-permissions-on-existing-pull-requests)) -- [ ] Secrets in the connector's spec are annotated with `airbyte_secret` +- [ ] Secrets in the connector's spec are annotated with `airbyte_secret` - [ ] Unit & integration tests added and passing. Community members, please provide proof of success locally e.g: screenshot or copy-paste unit, integration, and acceptance test output. To run acceptance tests for a Python connector, follow instructions in the README. For java connectors run `./gradlew :airbyte-integrations:connectors::integrationTest`. - [ ] Code reviews completed -- [ ] Documentation updated +- [ ] Documentation updated - [ ] Connector's `README.md` - [ ] Connector's `bootstrap.md`. See [description and examples](https://docs.google.com/document/d/1ypdgmwmEHWv-TrO4_YOQ7pAJGVrMp5BOkEVh831N260/edit?usp=sharing) - [ ] Changelog updated in `docs/integrations//.md` including changelog. See changelog [example](https://docs.airbyte.io/integrations/sources/stripe#changelog) - [ ] PR name follows [PR naming conventions](https://docs.airbyte.io/contributing-to-airbyte/updating-documentation#issues-and-pull-requests) - -#### Airbyter -If this is a community PR, the Airbyte engineer reviewing this PR is responsible for the below items. - +### Airbyter + +If this is a community PR, the Airbyte engineer reviewing this PR is responsible for the below items. + - [ ] Create a non-forked branch based on this PR and test the below items on it - [ ] Build is successful -- [ ] Credentials added to Github CI. [Instructions](https://docs.airbyte.io/connector-development#using-credentials-in-ci). -- [ ] [`/test connector=connectors/` command](https://docs.airbyte.io/connector-development#updating-an-existing-connector) is passing. -- [ ] New Connector version released on Dockerhub by running the `/publish` command described [here](https://docs.airbyte.io/connector-development#updating-an-existing-connector) -- [ ] After the new connector version is published, connector version bumped in the seed directory as described [here](https://docs.airbyte.io/connector-development#publishing-a-connector) -- [ ] Seed specs have been re-generated by building the platform and committing the changes to the seed spec files, as described [here](https://docs.airbyte.io/connector-development#publishing-a-connector) +- [ ] If new credentials are required for use in CI, add them to GSM. [Instructions](https://docs.airbyte.io/connector-development#using-credentials-in-ci). +- [ ] [`/test connector=connectors/` command](https://docs.airbyte.io/connector-development#updating-an-existing-connector) is passing +- [ ] New Connector version released on Dockerhub and connector version bumped by running the `/publish` command described [here](https://docs.airbyte.io/connector-development#updating-an-existing-connector) -

-
Connector Generator -

- +

Connector Generator + - [ ] Issue acceptance criteria met - [ ] PR name follows [PR naming conventions](https://docs.airbyte.io/contributing-to-airbyte/updating-documentation#issues-and-pull-requests) - [ ] If adding a new generator, add it to the [list of scaffold modules being tested](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connector-templates/generator/build.gradle#L41) - [ ] The generator test modules (all connectors with `-scaffold` in their name) have been updated with the latest scaffold by running `./gradlew :airbyte-integrations:connector-templates:generator:testScaffoldTemplates` then checking in your changes -- [ ] Documentation which references the generator is updated as needed. -

+- [ ] Documentation which references the generator is updated as needed + +
+ +## Tests + +
Unit + +*Put your unit tests output here.* + +
+ +
Integration + +*Put your integration tests output here.* + +
+ +
Acceptance + +*Put your acceptance tests output here.* +
diff --git a/.github/workflows/assign-issue-to-project.yml b/.github/workflows/assign-issue-to-project.yml new file mode 100644 index 0000000000000..2bb4061adc5f9 --- /dev/null +++ b/.github/workflows/assign-issue-to-project.yml @@ -0,0 +1,22 @@ +name: Assign to project + +on: + issues: + types: [labeled] + +env: + GH_PROJECT_TOKEN: ${{ secrets.OCTAVIA_PAT }} + +jobs: + # Simple assignment of issues to projects. + assign-to-project: + runs-on: ubuntu-latest + steps: + - uses: timroes/assign-to-project-action@0.1.0 + with: + token: ${{ env.GH_PROJECT_TOKEN }} + # Specify which label should get added to which project. The project number can be found + # as part of the URL after projects/ when viewing the project on GitHub. + projects: | + team/frontend=7 + project/onboarding-improvements=16 diff --git a/.github/workflows/build-connector-command.yml b/.github/workflows/build-connector-command.yml new file mode 100644 index 0000000000000..a9141849308dd --- /dev/null +++ b/.github/workflows/build-connector-command.yml @@ -0,0 +1,267 @@ +name: Bump, Build, Test Connectors [EXPERIMENTAL] +on: + workflow_dispatch: + inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master + connector: + description: "Airbyte Connector" + required: true + bump-version: + description: "Set to major, minor, or patch to automatically bump connectors version in Dockerfile, definitions.yaml and generate seed spec. You can also do this manually" + required: false + default: "false" + run-tests: + description: "Should run tests" + required: false + default: "true" + comment-id: + description: "The comment-id of the slash command. Used to update the comment with the status." + required: false + +jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} + ## Gradle Build + # In case of self-hosted EC2 errors, remove this block. 
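The `find_valid_pat` job above delegates to `./tools/bin/find_non_rate_limited_PAT`, which is not included in this diff. A minimal sketch of the idea, assuming the script simply probes each token against the GitHub rate-limit API and emits the first usable one as the `pat` step output consumed via `needs.find_valid_pat.outputs.pat`:

```bash
#!/usr/bin/env bash
# Assumed sketch of ./tools/bin/find_non_rate_limited_PAT (the real script is not shown here).
# Usage: find_non_rate_limited_PAT <pat1> <pat2> ...
for token in "$@"; do
  # Check how much core API quota is left for this token.
  remaining=$(curl -s -H "Authorization: token ${token}" \
    https://api.github.com/rate_limit | jq -r '.resources.core.remaining // 0')
  if [ "${remaining}" -gt 1000 ]; then
    # Expose the token as the step output the workflow reads (steps.variables.outputs.pat).
    echo "::set-output name=pat::${token}"
    exit 0
  fi
done
echo "No PAT with available rate limit found" >&2
exit 1
```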
+ start-bump-build-test-connector-runner: + name: Start Build EC2 Runner + runs-on: ubuntu-latest + needs: find_valid_pat + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + # 80 gb disk + ec2-image-id: ami-0d648081937c75a73 + bump-build-test-connector: + name: Bump, Build, Test Connector + needs: start-bump-build-test-connector-runner + runs-on: ${{ needs.start-bump-build-test-connector-runner.outputs.label }} + environment: more-secrets + steps: + ############################ + ## SET UP ## + ############################ + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@v0 + with: + service_account_key: ${{ secrets.SPEC_CACHE_SERVICE_ACCOUNT_KEY }} + export_default_credentials: true + - name: Search for valid connector name format + id: regex + uses: AsasInnab/regex-action@v1 + with: + regex_pattern: "^(connectors|bases)/[a-zA-Z0-9-_]+$" + regex_flags: "i" # required to be set for this plugin + search_string: ${{ github.event.inputs.connector }} + - name: Validate input workflow format + if: steps.regex.outputs.first_match != github.event.inputs.connector + run: echo "The connector provided has an invalid format!" && exit 1 + - name: Link comment to workflow run + if: github.event.inputs.comment-id + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + > :clock2: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + token: ${{ secrets.OCTAVIA_PAT }} + - name: Install Java + uses: actions/setup-java@v1 + with: + java-version: "17" + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: Install Pyenv and Tox + run: | + python3 -m pip install --quiet virtualenv==16.7.9 --user + python3 -m virtualenv venv + source venv/bin/activate + pip install --quiet tox==3.24.4 + - name: Install yq + if: github.event.inputs.bump-version != 'false' && success() + run: | + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys CC86BB64 + sudo add-apt-repository ppa:rmescandon/yq + sudo apt update + sudo apt install yq -y + - name: Test and install CI scripts + # all CI python packages have the prefix "ci_" + run: | + source venv/bin/activate + tox -r -c ./tools/tox_ci.ini + pip install --quiet -e ./tools/ci_* + - name: Get Credentials for ${{ github.event.inputs.connector }} + run: | + source venv/bin/activate + ci_credentials ${{ github.event.inputs.connector }} + # normalization also runs destination-specific tests, so fetch their creds also + if [ 'bases/base-normalization' = "${{ github.event.inputs.connector }}" ] || [ 'base-normalization' = "${{ github.event.inputs.connector }}" ]; then + ci_credentials destination-bigquery + ci_credentials destination-postgres + ci_credentials destination-snowflake + fi + env: + 
GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} + # TODO: seems like this should run in post-merge workflow + # - name: Prepare Sentry + # if: startsWith(github.event.inputs.connector, 'connectors') + # run: | + # curl -sL https://sentry.io/get-cli/ | bash + # - name: Create Sentry Release + # if: startsWith(github.event.inputs.connector, 'connectors') + # run: | + # sentry-cli releases set-commits "${{ env.IMAGE_NAME }}@${{ env.IMAGE_VERSION }}" --auto --ignore-missing + # env: + # SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_CONNECTOR_RELEASE_AUTH_TOKEN }} + # SENTRY_ORG: airbyte-5j + # SENTRY_PROJECT: airbyte-connectors + ############################ + ## BUMP ## + ############################ + - name: Bump Connector Version + if: github.event.inputs.bump-version != 'false' && success() + run: ./tools/integrations/manage.sh bump_version airbyte-integrations/${{ github.event.inputs.connector }} + - name: Commit and Push Version Bump + if: github.event.inputs.bump-version != 'false' && success() + run: | + git config user.name 'Octavia Squidington III' + git config user.email 'octavia-squidington-iii@users.noreply.github.com' + git add -u + git commit -m "bump-version ${{github.event.inputs.connector}}" + git push origin ${{ github.event.inputs.gitref }} + - name: Add Version Bump Success Comment + if: github.event.inputs.comment-id && github.event.inputs.bump-version != 'false' && success() + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + > :rocket: Bumped version for ${{github.event.inputs.connector}} + - name: Add Version Bump Failure Comment + if: github.event.inputs.comment-id && github.event.inputs.bump-version != 'false' && !success() + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + > :x: Couldn't bump version for ${{github.event.inputs.connector}} + ############################ + ## BUILD AND TEST ## + ############################ + - name: Build ${{ github.event.inputs.connector }} + run: ./tools/integrations/manage.sh build_experiment airbyte-integrations/${{ github.event.inputs.connector }} + id: build + env: + PR_NUMBER: ${{ github.event.number }} + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} + # Oracle expects this variable to be set. Although usually present, this is not set by default on Github virtual runners. 
+ TZ: UTC + # - name: Test ${{ github.event.inputs.connector }} + # if: github.event.inputs.run-tests == 'true' + # run: ./tools/integrations/manage.sh test airbyte-integrations/${{ github.event.inputs.connector }} + # - name: Finalize Sentry release + # if: startsWith(github.event.inputs.connector, 'connectors') + # run: | + # sentry-cli releases finalize "${{ env.IMAGE_NAME }}@${{ env.IMAGE_VERSION }}" + # env: + # SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_CONNECTOR_RELEASE_AUTH_TOKEN }} + # SENTRY_ORG: airbyte-5j + # SENTRY_PROJECT: airbyte-connectors + # - name: Build and Test Success Comment + # if: github.event.inputs.comment-id && success() + # uses: peter-evans/create-or-update-comment@v1 + # with: + # comment-id: ${{ github.event.inputs.comment-id }} + # body: | + # > :rocket: Successfully built and tested ${{github.event.inputs.connector}} + # - name: Build and Test Failure Comment + # if: github.event.inputs.comment-id && !success() + # uses: peter-evans/create-or-update-comment@v1 + # with: + # comment-id: ${{ github.event.inputs.comment-id }} + # body: | + # > :x: Failed to build and test ${{github.event.inputs.connector}} + # - name: Slack Notification - Failure + # if: failure() + # uses: rtCamp/action-slack-notify@master + # env: + # SLACK_WEBHOOK: ${{ secrets.BUILD_SLACK_WEBHOOK }} + # SLACK_USERNAME: Buildozer + # SLACK_ICON: https://avatars.slack-edge.com/temp/2020-09-01/1342729352468_209b10acd6ff13a649a1.jpg + # SLACK_COLOR: DC143C + # SLACK_TITLE: "Failed to build and test connector ${{ github.event.inputs.connector }} from branch ${{ github.ref }}" + # SLACK_FOOTER: "" + # - name: Add Final Success Comment + # if: github.event.inputs.comment-id && success() + # uses: peter-evans/create-or-update-comment@v1 + # with: + # comment-id: ${{ github.event.inputs.comment-id }} + # body: | + # > :white_check_mark: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + # - name: Set publish label + # if: success() + # run: | + # echo "set some label on PR" + # In case of self-hosted EC2 errors, remove this block. + stop-bump-build-test-connector-runner: + name: Stop Build EC2 Runner + needs: + - start-bump-build-test-connector-runner # required to get output from the start-runner job + - bump-build-test-connector # required to wait when the main job is done + - find_valid_pat + runs-on: ubuntu-latest + if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + - name: Stop EC2 runner + uses: supertopher/ec2-github-runner@base64v1.0.10 + with: + mode: stop + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ needs.start-bump-build-test-connector-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-bump-build-test-connector-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/build-report.yml b/.github/workflows/build-report.yml index 44ca93b45f6fb..8e181d2277f02 100644 --- a/.github/workflows/build-report.yml +++ b/.github/workflows/build-report.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: schedule: # 1pm UTC is 6am PDT. 
- - cron: '0 13 * * *' + - cron: "0 13 * * *" jobs: build-report: @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install requests slack_sdk + pip install requests slack_sdk pyyaml - name: create and send report run: python ./tools/bin/build_report.py env: @@ -32,6 +32,6 @@ jobs: SLACK_USERNAME: Build Report SLACK_ICON: https://avatars.slack-edge.com/temp/2020-09-01/1342729352468_209b10acd6ff13a649a1.jpg SLACK_COLOR: ${{ job.status }} - SLACK_TITLE: 'Failed to create build report' - SLACK_MESSAGE: 'https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}' + SLACK_TITLE: "Failed to create build report" + SLACK_MESSAGE: "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" MSG_MINIMAL: True diff --git a/.github/workflows/connector_integration_tests.yml b/.github/workflows/connector_integration_tests.yml index 3dc913eeb75fa..d827c8acb350f 100644 --- a/.github/workflows/connector_integration_tests.yml +++ b/.github/workflows/connector_integration_tests.yml @@ -14,7 +14,15 @@ jobs: steps: - name: Checkout Airbyte uses: actions/checkout@v2 + - name: Install Java + uses: actions/setup-java@v1 + with: + java-version: '17' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install PyYAML requests - name: Launch Integration Tests - run: ./tools/bin/ci_integration_workflow_launcher.sh + run: python ./tools/bin/ci_integration_workflow_launcher.py base-normalization source-acceptance-test source:beta source:GA destination:beta destination:GA env: GITHUB_TOKEN: ${{ secrets.SLASH_COMMAND_PAT }} diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml new file mode 100644 index 0000000000000..2cb8dfea3d494 --- /dev/null +++ b/.github/workflows/create-release.yml @@ -0,0 +1,71 @@ +# This is an action that runs when an Airbyte version bump is merged into master. +# It fetches the changelog from the version bump PR and automatically creates a +# Release for the version bump. +name: Create Release + +on: + push: + branches: + - master + +jobs: + create-release: + if: startsWith(github.event.head_commit.message, 'Bump Airbyte version') + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: read + steps: + - name: Fetch Version Bump PR Body + id: fetch_pr_body + env: + COMMIT_ID: ${{ github.event.head_commit.id }} + shell: bash + run: |- + set -x + PR=$(curl \ + -H "Accept: application/vnd.github.v3+json" \ + -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/commits/$COMMIT_ID/pulls) + # The printf helps escape characters so that jq can parse the output. + # The sed removes carriage returns so that the body is easier to parse later, and + # escapes backticks so that they are not executed as commands. + PR_BODY=$(printf '%s' "$PR" | jq '.[0].body' | sed 's/\\r//g' | sed 's/`/\\`/g') + echo ::set-output name=pr_body::${PR_BODY} + - name: Extract Changelog + id: extract_changelog + shell: bash + run: |- + set -x + PR_BODY=${{ steps.fetch_pr_body.outputs.pr_body}} + if [[ $PR_BODY = "null" ]]; then + echo "No PR body exists for this commit, so a release cannot be generated." 
+ exit 1 + fi + # this regex extracts just the changelog contents + if [[ $PR_BODY =~ Changelog:(\\n)*(.*)\\n\\n ]]; then + CHANGELOG="${BASH_REMATCH[2]}" + else + echo "PR body does not match the changelog extraction regex" + exit 1 + fi + # save CHANGELOG into a multiline env var on the action itself, since Github Actions do not support outputting multiline strings well + echo "CHANGELOG<> $GITHUB_ENV + echo -e "$CHANGELOG" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Get Version + id: get_version + shell: bash + run: | + VERSION=$(grep -w VERSION .env | cut -d"=" -f2) + echo ::set-output name=VERSION::${VERSION} + - name: Create Release + id: create_release + uses: ncipollo/release-action@v1 + with: + body: ${{ env.CHANGELOG }} + token: ${{ secrets.GITHUB_TOKEN }} + prerelease: true + tag: v${{ steps.get_version.outputs.VERSION }} diff --git a/.github/workflows/fe-validate-links.yml b/.github/workflows/fe-validate-links.yml new file mode 100644 index 0000000000000..5ade0e19cee20 --- /dev/null +++ b/.github/workflows/fe-validate-links.yml @@ -0,0 +1,50 @@ +name: Frontend Link Validation + +on: + workflow_dispatch: + schedule: + - cron: '0 14 * * *' + +jobs: + validate-frontend-links: + name: "Validate frontend links" + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + + - uses: actions/setup-java@v1 + with: + java-version: "17" + + - uses: actions/setup-node@v2 + with: + node-version: "lts/gallium" + + - name: Set up CI Gradle Properties + run: | + mkdir -p ~/.gradle/ + cat > ~/.gradle/gradle.properties <- + {\"channel\":\"C03088BTMFC\", \"blocks\":[ + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\":alarm: The periodic link validation failed!\n\n\"}}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\"See details on \n\"}}]} \ No newline at end of file diff --git a/.github/workflows/github-community-label-bot.yaml b/.github/workflows/github-community-label-bot.yaml deleted file mode 100644 index 5b85037289494..0000000000000 --- a/.github/workflows/github-community-label-bot.yaml +++ /dev/null @@ -1,15 +0,0 @@ -name: CI -on: - issues: - types: [opened] - pull_request_target: - types: [opened] - -jobs: - action: - runs-on: ubuntu-latest - steps: - - uses: harshithmullapudi/label-actions@75686c2b3de17244526f10a22424f319d0bc134f - with: - github-token: ${{ secrets.LABEL_BOT_TOKEN }} - github-username: ${{ secrets.LABEL_BOT_USERNAME }} diff --git a/.github/workflows/gke-kube-test-command.yml b/.github/workflows/gke-kube-test-command.yml index 05eedfc52409b..9c317d052e058 100644 --- a/.github/workflows/gke-kube-test-command.yml +++ b/.github/workflows/gke-kube-test-command.yml @@ -1,31 +1,58 @@ name: GKE Kube Acceptance Test on: - schedule: - - cron: '0 */6 * * *' workflow_dispatch: inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master comment-id: - description: 'The comment-id of the slash command. Used to update the comment with the status.' + description: "The comment-id of the slash command. Used to update the comment with the status." 
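The create-release workflow above writes the extracted changelog into `$GITHUB_ENV` using the multiline-value pattern; spelled out as a standalone sketch (assumed reconstruction of the truncated `echo "CHANGELOG<<EOF"` line, not a verbatim quote of the workflow):

```bash
# Export a multiline string to subsequent steps via $GITHUB_ENV.
# Actions reads the file line by line; the EOF delimiter marks where the value ends.
CHANGELOG=$'* fixed a bug\n* added a feature'   # example content only

echo "CHANGELOG<<EOF" >> "$GITHUB_ENV"
echo -e "$CHANGELOG"  >> "$GITHUB_ENV"
echo "EOF"            >> "$GITHUB_ENV"

# Later steps can then reference ${{ env.CHANGELOG }}.
```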
required: false jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} start-gke-kube-acceptance-test-runner: timeout-minutes: 10 name: Start GKE Kube Acceptance Test EC2 Runner runs-on: ubuntu-latest + needs: find_valid_pat outputs: label: ${{ steps.start-ec2-runner.outputs.label }} ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} steps: - name: Checkout Airbyte uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Start AWS Runner id: start-ec2-runner uses: ./.github/actions/start-aws-runner with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} gke-kube-acceptance-test: # In case of self-hosted EC2 errors, removed the `needs` line and switch back to running on ubuntu-latest. needs: start-gke-kube-acceptance-test-runner # required to start the main job when the runner is ready @@ -43,14 +70,17 @@ jobs: - name: Checkout Airbyte uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - uses: actions/setup-java@v1 with: - java-version: '17' + java-version: "17" - - uses: actions/setup-node@v1 + - uses: actions/setup-node@v2 with: - node-version: '16.13.0' + node-version: "lts/gallium" - name: Fix EC-2 Runner run: | @@ -62,7 +92,7 @@ jobs: sudo apt-get install socat - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@master + uses: google-github-actions/setup-gcloud@v0 with: project_id: ${{ secrets.GKE_TEST_PROJECT_ID }} service_account_key: ${{ secrets.GKE_TEST_SA_KEY }} @@ -86,7 +116,8 @@ jobs: env: USER: root HOME: /home/runner - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} ACTION_RUN_ID: ${{github.run_id}} run: | ./tools/bin/gke-kube-acceptance-test/acceptance_test_kube_gke.sh @@ -112,6 +143,7 @@ jobs: needs: - start-gke-kube-acceptance-test-runner # required to get output from the start-runner job - gke-kube-acceptance-test # required to wait when the main job is done + - find_valid_pat runs-on: ubuntu-latest if: ${{ always() }} steps: @@ -122,10 +154,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2.3.0 + uses: supertopher/ec2-github-runner@base64v1.0.10 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} label: ${{ needs.start-gke-kube-acceptance-test-runner.outputs.label }} ec2-instance-id: ${{ needs.start-gke-kube-acceptance-test-runner.outputs.ec2-instance-id }} - diff --git a/.github/workflows/gradle.yml b/.github/workflows/gradle.yml index f0b24c126e75d..fe5d8f26222b2 100644 --- a/.github/workflows/gradle.yml +++ b/.github/workflows/gradle.yml @@ -1,6 +1,8 @@ name: Airbyte CI 
on: + #ability to start task manually in Web UI + workflow_dispatch: schedule: - cron: "0 */1 * * *" push: @@ -8,10 +10,155 @@ on: - "gitbook/v1" jobs: - ## Gradle Build (Connectors Base) + # COMMON TASKS + ensure-images-exist: + name: "Ensure all required Docker images exist on Dockerhub" + timeout-minutes: 10 + runs-on: ubuntu-latest + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + + - name: Check images exist + run: ./tools/bin/check_images_exist.sh all + # The output of this job is used to trigger the following builds. + changes: + name: "Detect Modified Files" + # The filtering action does not deal with well scheduled events so skip to avoid errors. + # See https://github.com/dorny/paths-filter/issues/100 for more info. + # This is okay this workflow is only scheduled on master, where we want to build everything + # so filtering is not required. Use always() in each start block to force the start task. + if: github.event_name != 'schedule' + runs-on: ubuntu-latest + outputs: + backend: ${{ steps.filter.outputs.backend }} + build: ${{ steps.filter.outputs.build }} + cli: ${{ steps.filter.outputs.cli }} + connectors: ${{ steps.filter.outputs.connectors }} + db: ${{ steps.filter.outputs.db }} + frontend: ${{ steps.filter.outputs.frontend }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - uses: dorny/paths-filter@v2 + id: filter + with: + # Note, the following glob expression within a filters are ORs. + filters: | + backend: + - 'airbyte-!(cdk|integrations|webapp|webapp-e2e-tests)/**' + build: + - '.github/**' + - 'buildSrc/**' + - 'tools/**' + - '*.gradle' + cli: + - 'airbyte-api/**' + - 'octavia-cli/**' + connectors: + - 'airbyte-cdk/**' + - 'airbyte-protocol/**' + - 'airbyte-integrations/**' + db: + - 'airbyte-db/**' + frontend: + - 'airbyte-api/src/main/openapi/config.yaml' + - 'airbyte-webapp/**' + - 'airbyte-webapp-e2e-tests/**' + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} + + # Uncomment to debug. + # changes-output: + # name: "Debug Change Detection Logic" + # needs: changes + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v2 + # - run: | + # echo '${{ toJSON(needs) }}' + + ## BUILDS + octavia-cli-build: + needs: changes + runs-on: ubuntu-latest + # Because scheduled builds on master require us to skip the changes job. Use always() to force this to run on master. 
+ if: needs.changes.outputs.cli == 'true' || needs.changes.outputs.build == 'true' || (always() && github.ref == 'refs/heads/master') + name: "Octavia CLI: Build" + timeout-minutes: 90 + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + + - name: Cache Build Artifacts + uses: ./.github/actions/cache-build-artifacts + with: + cache-key: ${{ secrets.CACHE_VERSION }} + cache-python: "false" + + - uses: actions/setup-java@v1 + with: + java-version: "17" + + - uses: actions/setup-python@v2 + with: + python-version: "3.9" + + - name: Set up CI Gradle Properties + run: | + mkdir -p ~/.gradle/ + cat > ~/.gradle/gradle.properties < ~/.gradle/gradle.properties </dev/null 2>&1; do - sleep 1 - done - - sudo apt-get update && sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libgbm-dev libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb + node-version: "lts/gallium" - name: Set up CI Gradle Properties run: | mkdir -p ~/.gradle/ cat > ~/.gradle/gradle.properties </dev/null 2>&1; do - sleep 1 - done - - sudo apt-get update - sudo apt-get install socat - - name: Create cluster config file run: | cat > /tmp/kind-config.yaml <- + {\"channel\":\"C03BEADRPNY\", \"blocks\":[ + {\"type\":\"divider\"}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\" Merge to OSS Master failed! :bangbang: \n\n\"}}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\"_merged by_: *${{ github.actor }}* \n\"}}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\" :octavia-shocked: :octavia-shocked: \n\"}}, + {\"type\":\"divider\"}]} + + notify-failure-slack-channel-fixed-broken-build: + name: "Notify Slack Channel on Build Fixes" + runs-on: ubuntu-latest + needs: + - build-connectors-base + - frontend-build + - octavia-cli-build + - platform-build + - kube-acceptance-test + if: success() + steps: + - name: Get Previous Workflow Status + uses: Mercymeilya/last-workflow-status@v0.3 + id: last_status + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + # To avoid clogging up the channel, only publish build success if the previous build was a failure since this means the build was fixed. + - name: Publish Build Fixed Message to OSS Build Failure Slack Channel + if: ${{ steps.last_status.outputs.last_status == 'failure' }} + uses: abinoda/slack-action@master + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN_AIRBYTE_TEAM }} + with: + args: >- + {\"channel\":\"C03BEADRPNY\", \"blocks\":[ + {\"type\":\"divider\"}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\" OSS Master Fixed! 
:white_check_mark: \n\n\"}}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\"_merged by_: *${{ github.actor }}* \n\"}}, + {\"type\":\"section\",\"text\":{\"type\":\"mrkdwn\",\"text\":\" :octavia-rocket: :octavia-rocket: \n\"}}, + {\"type\":\"divider\"}]} diff --git a/.github/workflows/notify-on-label.yml b/.github/workflows/notify-on-label.yml new file mode 100644 index 0000000000000..8f2999c8955e5 --- /dev/null +++ b/.github/workflows/notify-on-label.yml @@ -0,0 +1,17 @@ +name: Notify team + +on: + issues: + types: [labeled] + +jobs: + notify: + runs-on: ubuntu-latest + steps: + - uses: jenschelkopf/issue-label-notification-action@1.3 + with: + token: ${{ secrets.OCTAVIA_PAT }} + message: 'cc {recipients}' + # Specify a map of label -> team/user to notify + recipients: | + team/frontend=@airbytehq/frontend diff --git a/.github/workflows/notify-on-push-to-master.yml b/.github/workflows/notify-on-push-to-master.yml new file mode 100644 index 0000000000000..48f3a8f352711 --- /dev/null +++ b/.github/workflows/notify-on-push-to-master.yml @@ -0,0 +1,18 @@ +name: Notify Cloud of OSS Push to Master +on: + push: + branches: + - master + workflow_dispatch: + +jobs: + repo-sync: + runs-on: ubuntu-latest + steps: + - name: Repository Dispatch + uses: peter-evans/repository-dispatch@v2 + with: + token: ${{ secrets.OCTAVIA_PAT }} + repository: airbytehq/airbyte-cloud + event-type: oss-push-to-master + client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' diff --git a/.github/workflows/platform-project-automation.yml b/.github/workflows/platform-project-automation.yml deleted file mode 100644 index d53f2a508e241..0000000000000 --- a/.github/workflows/platform-project-automation.yml +++ /dev/null @@ -1,34 +0,0 @@ -# See https://github.com/marketplace/actions/project-beta-automations for guidance - -name: Platform Project Automation -on: - issues: - types: [labeled] - -env: - GH_PROJECT_TOKEN: ${{ secrets.PARKER_PAT_FOR_PLATFORM_PROJECT_AUTOMATION }} - ORG: airbytehq - PROJECT_ID: 6 # https://github.com/orgs/airbytehq/projects/6/views/8 - FIELD_STATUS: Status - STATUS_TODO: Todo - FIELD_DATE_ADDED: Date Added - -jobs: - add-area-platform-issues-to-platform-project: - runs-on: ubuntu-latest - name: Add area/platform issue to Platform Project - steps: - - name: Set current date env var - id: set_date - run: echo ::set-output name=CURRENT_DATE::$(date +'%Y-%m-%dT%H:%M:%S%z') - - - name: Add issue to project if labelled with area/platform - uses: leonsteinhaeuser/project-beta-automations@v1.1.0 - if: contains(github.event.issue.labels.*.name, 'area/platform') - with: - gh_token: ${{ env.GH_PROJECT_TOKEN }} - organization: ${{ env.ORG }} - project_id: ${{ env.PROJECT_ID }} - resource_node_id: ${{ github.event.issue.node_id }} - operation_mode: custom_field - custom_field_values: '[{\"name\": \"Status\",\"type\": \"single_select\",\"value\": \"${{ env.STATUS_TODO }}\"},{\"name\": \"${{ env.FIELD_DATE_ADDED }}\",\"type\": \"date\",\"value\": \"${{ steps.set_date.outputs.CURRENT_DATE }}\"}]' diff --git a/.github/workflows/publish-cdk-command.yml b/.github/workflows/publish-cdk-command.yml index 77b937f7a7d9b..2ca72bc3c184b 100644 --- a/.github/workflows/publish-cdk-command.yml +++ b/.github/workflows/publish-cdk-command.yml @@ -2,30 +2,39 @@ name: Publish CDK on: workflow_dispatch: inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." 
+ required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master dry-run: description: 'By default dry-run publishes to Test PyPi. Use "false" to publish to actual PyPi servers.' required: false comment-id: - description: 'The comment-id of the slash command. Used to update the comment with the status.' + description: "The comment-id of the slash command. Used to update the comment with the status." required: false jobs: - build-cdk: runs-on: ubuntu-latest strategy: matrix: - # 3.7.1 - is a minimal of 3.7.X version supported by github actions - python-version: [3.7.1, 3.7, 3.8, 3.9] + python-version: ["3.9"] steps: - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - uses: actions/setup-java@v1 with: - java-version: '17' + java-version: "17" - name: Checkout Airbyte uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Build CDK Package run: SUB_BUILD=CONNECTORS_BASE ./gradlew --no-daemon --no-build-cache :airbyte-cdk:python:build - name: Add Failure Comment @@ -56,13 +65,16 @@ jobs: echo "pypi_url=https://test.pypi.org/legacy/" >> $GITHUB_ENV - name: Checkout Airbyte uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Publish Python Package uses: mariamrf/py-package-publish-action@v1.1.0 with: # specify the same version as in ~/.python-version - python_version: '3.7.9' - pip_version: '21.1' - subdir: 'airbyte-cdk/python/' + python_version: "3.9.11" + pip_version: "21.1" + subdir: "airbyte-cdk/python/" env: TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} diff --git a/.github/workflows/publish-command.yml b/.github/workflows/publish-command.yml index 2770eaa8b35f7..242216dbc18ed 100644 --- a/.github/workflows/publish-command.yml +++ b/.github/workflows/publish-command.yml @@ -2,23 +2,57 @@ name: Publish Connector Image on: workflow_dispatch: inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master connector: description: "Airbyte Connector" required: true + comment-id: + description: "The comment-id of the slash command. Used to update the comment with the status." + required: false + auto-bump-version: + description: "after publishing, the workflow will automatically bump the connector version in definitions and generate seed spec" + required: true + default: "true" + parallel: + description: "Switching this to true will spin up 5 build agents instead of 1 and allow multi connector publishes to run in parallel" + required: true + default: "false" run-tests: description: "Should run tests when publishing" required: true default: "true" - comment-id: - description: "The comment-id of the slash command. Used to update the comment with the status." 
- required: false jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} ## Gradle Build # In case of self-hosted EC2 errors, remove this block. - start-publish-image-runner: - name: Start Build EC2 Runner + start-publish-image-runner-0: + name: Start Build EC2 Runner 0 runs-on: ubuntu-latest + needs: find_valid_pat outputs: label: ${{ steps.start-ec2-runner.outputs.label }} ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} @@ -26,23 +60,165 @@ jobs: - name: Checkout Airbyte uses: actions/checkout@v2 with: - repository: ${{github.event.pull_request.head.repo.full_name}} # always use the branch's repository + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Start AWS Runner id: start-ec2-runner uses: ./.github/actions/start-aws-runner with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} - # 80 gb disk - ec2-image-id: ami-0d648081937c75a73 + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ github.run_id }}-publisher + start-publish-image-runner-1: + if: github.event.inputs.parallel == 'true' && success() + name: Start Build EC2 Runner 1 + runs-on: ubuntu-latest + needs: find_valid_pat + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ github.run_id }}-publisher + start-publish-image-runner-2: + if: github.event.inputs.parallel == 'true' && success() + name: Start Build EC2 Runner 2 + runs-on: ubuntu-latest + needs: find_valid_pat + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ github.run_id }}-publisher + start-publish-image-runner-3: + if: github.event.inputs.parallel == 'true' && success() + name: Start Build EC2 Runner 3 + runs-on: ubuntu-latest + needs: find_valid_pat + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: 
actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ github.run_id }}-publisher + start-publish-image-runner-4: + if: github.event.inputs.parallel == 'true' && success() + name: Start Build EC2 Runner 4 + runs-on: ubuntu-latest + needs: find_valid_pat + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ github.run_id }}-publisher + preprocess-matrix: + needs: start-publish-image-runner-0 + runs-on: ${{ needs.start-publish-image-runner-0.outputs.label }} + outputs: + connectorjson: ${{ steps.preprocess.outputs.connectorjson }} + steps: + # given a string input of a single connector or comma separated list of connectors e.g. connector1, connector2 + # this step builds an array, by removing whitespace, add in quotation marks around connectors and braces [ ] at the start and end + # finally, it sets it as output from this job so we can use this array of connectors as our matrix strategy for publishing + - id: preprocess + run: | + start="[\"" + replace="\",\"" + end="\"]" + stripped_connector="$(echo "${{ github.event.inputs.connector }}" | tr -d ' ')" + middle=${stripped_connector//,/$replace} + full="$start$middle$end" + echo "::set-output name=connectorjson::$full" + write-initial-output-to-comment: + name: Set up git comment + if: github.event.inputs.comment-id + needs: start-publish-image-runner-0 + runs-on: ${{ needs.start-publish-image-runner-0.outputs.label }} + steps: + - name: Print start message + if: github.event.inputs.comment-id && success() + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + > :clock2: Publishing the following connectors:
${{ github.event.inputs.connector }}
https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} + - name: Create table header + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | +
+ + | Connector | Did it publish? | Were definitions generated? | + - name: Create table separator + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + | --- | --- | --- | publish-image: - needs: start-publish-image-runner - runs-on: ${{ needs.start-publish-image-runner.outputs.label }} + timeout-minutes: 240 + needs: + - start-publish-image-runner-0 + - preprocess-matrix + - write-initial-output-to-comment + strategy: + max-parallel: 5 + fail-fast: false + matrix: + connector: ${{ fromJSON(needs.preprocess-matrix.outputs.connectorjson) }} + runs-on: runner-pool-${{ github.run_id }} environment: more-secrets steps: - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@master + uses: google-github-actions/setup-gcloud@v0 with: service_account_key: ${{ secrets.SPEC_CACHE_SERVICE_ACCOUNT_KEY }} export_default_credentials: true @@ -52,89 +228,221 @@ jobs: with: regex_pattern: "^(connectors|bases)/[a-zA-Z0-9-_]+$" regex_flags: "i" # required to be set for this plugin - search_string: ${{ github.event.inputs.connector }} + search_string: ${{ matrix.connector }} - name: Validate input workflow format - if: steps.regex.outputs.first_match != github.event.inputs.connector + if: steps.regex.outputs.first_match != matrix.connector run: echo "The connector provided has an invalid format!" && exit 1 - - name: Link comment to workflow run - if: github.event.inputs.comment-id - uses: peter-evans/create-or-update-comment@v1 - with: - comment-id: ${{ github.event.inputs.comment-id }} - body: | - > :clock2: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} - name: Checkout Airbyte uses: actions/checkout@v2 with: - repository: ${{github.event.pull_request.head.repo.full_name}} # always use the branch's repository - - name: Install Unzip for Databricks - if: github.event.inputs.connector == 'connectors/destination-databricks' - run: | - apt-get update && apt-get install -y unzip + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + token: ${{ secrets.OCTAVIA_PAT }} - name: Install Java uses: actions/setup-java@v1 with: - java-version: '17' + java-version: "17" + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: "3.9" - name: Install Pyenv and Tox - # Beside PyEnv, this does not set any runtimes up because it uses an AMI image that has everything pre-installed. 
See https://github.com/airbytehq/airbyte/issues/4559/ run: | python3 -m pip install --quiet virtualenv==16.7.9 --user + rm -r venv || echo "no pre-existing venv" python3 -m virtualenv venv source venv/bin/activate pip install --quiet tox==3.24.4 + - name: Install yq + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys CC86BB64 + sudo add-apt-repository ppa:rmescandon/yq + sudo apt update + sudo apt install yq -y - name: Test and install CI scripts # all CI python packages have the prefix "ci_" run: | source venv/bin/activate tox -r -c ./tools/tox_ci.ini pip install --quiet -e ./tools/ci_* - - name: Write Integration Test Credentials for ${{ github.event.inputs.connector }} + - name: Write Integration Test Credentials for ${{ matrix.connector }} run: | source venv/bin/activate - ci_credentials ${{ github.event.inputs.connector }} + ci_credentials ${{ matrix.connector }} + # normalization also runs destination-specific tests, so fetch their creds also + if [ 'bases/base-normalization' = "${{ matrix.connector }}" ] || [ 'base-normalization' = "${{ matrix.connector }}" ]; then + ci_credentials destination-bigquery + ci_credentials destination-postgres + ci_credentials destination-snowflake + fi env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - - run: | - echo "$SPEC_CACHE_SERVICE_ACCOUNT_KEY" > spec_cache_key_file.json && docker login -u airbytebot -p ${DOCKER_PASSWORD} - ./tools/integrations/manage.sh publish airbyte-integrations/${{ github.event.inputs.connector }} ${{ github.event.inputs.run-tests }} --publish_spec_to_cache - name: publish ${{ github.event.inputs.connector }} + - name: Set Name and Version Environment Vars + if: startsWith(matrix.connector, 'connectors') + run: | + source tools/lib/lib.sh + DOCKERFILE=airbyte-integrations/${{ matrix.connector }}/Dockerfile + echo "IMAGE_NAME=$(echo ${{ matrix.connector }} | cut -d"/" -f2)" >> $GITHUB_ENV + echo "IMAGE_VERSION=$(_get_docker_image_version ${DOCKERFILE})" >> $GITHUB_ENV + - name: Prepare Sentry + if: startsWith(matrix.connector, 'connectors') + run: | + curl -sL https://sentry.io/get-cli/ | bash || echo "sentry cli already installed" + - name: Publish ${{ matrix.connector }} + run: | + echo "$SPEC_CACHE_SERVICE_ACCOUNT_KEY" > spec_cache_key_file.json && docker login -u ${DOCKER_HUB_USERNAME} -p ${DOCKER_HUB_PASSWORD} + ./tools/integrations/manage.sh publish airbyte-integrations/${{ matrix.connector }} ${{ github.event.inputs.run-tests }} --publish_spec_to_cache id: publish env: - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} # Oracle expects this variable to be set. Although usually present, this is not set by default on Github virtual runners. 
TZ: UTC - - name: Add Success Comment - if: github.event.inputs.comment-id && success() + - name: Create Sentry Release + if: startsWith(matrix.connector, 'connectors') && success() + run: | + SENTRY_RELEASE_NAME="airbyte-${{ env.IMAGE_NAME }}@${{ env.IMAGE_VERSION }}" + sentry-cli releases set-commits "$SENTRY_RELEASE_NAME" --auto --ignore-missing && + sentry-cli releases finalize "$SENTRY_RELEASE_NAME" + env: + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_CONNECTOR_RELEASE_AUTH_TOKEN }} + SENTRY_ORG: airbytehq + SENTRY_PROJECT: connector-incident-management + - name: Check if connector in definitions yaml + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + connector="airbyte/${{ env.IMAGE_NAME }}" + definitionpath=./airbyte-config/init/src/main/resources/seed/ + sourcecheck=$(yq e ".. | select(has(\"dockerRepository\")) | select(.dockerRepository == \"$connector\")" "$definitionpath"source_definitions.yaml) + destcheck=$(yq e ".. | select(has(\"dockerRepository\")) | select(.dockerRepository == \"$connector\")" "$definitionpath"destination_definitions.yaml) + if [[ (-z "$sourcecheck" && -z "$destcheck") ]] + then exit 1 + fi + - name: Bump version in definitions yaml + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + connector="airbyte/${{ env.IMAGE_NAME }}" + definitionpath=./airbyte-config/init/src/main/resources/seed/ + sourcename=$(yq e ".[] | select(has(\"dockerRepository\")) | select(.dockerRepository == \"$connector\") | .name" "$definitionpath"source_definitions.yaml) + destname=$(yq e ".[] | select(has(\"dockerRepository\")) | select(.dockerRepository == \"$connector\") | .name" "$definitionpath"destination_definitions.yaml) + if [ -z "$sourcename" ] + then yq e "(.[] | select(.name == \"$destname\").dockerImageTag)|=\"${{ env.IMAGE_VERSION }}\"" -i "$definitionpath"destination_definitions.yaml + else yq e "(.[] | select(.name == \"$sourcename\").dockerImageTag)|=\"${{ env.IMAGE_VERSION }}\"" -i "$definitionpath"source_definitions.yaml + fi + - name: Run gradle process changes + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + ./gradlew :airbyte-config:init:processResources + - name: git config + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + git config user.name 'Octavia Squidington III' + git config user.email 'octavia-squidington-iii@users.noreply.github.com' + - name: git commit and push + if: github.event.inputs.auto-bump-version == 'true' && success() + run: | + git add -u + git commit -m "auto-bump connector version [ci skip]" + git pull origin ${{ github.event.inputs.gitref }} + git push origin ${{ github.event.inputs.gitref }} + id: auto-bump + - name: Process outcomes into emojis + if: ${{ always() && github.event.inputs.comment-id }} + run: | + if [[ ${{ steps.publish.outcome }} = "success" ]]; then + echo "PUBLISH_OUTCOME=:white_check_mark:" >> $GITHUB_ENV + else + echo "PUBLISH_OUTCOME=:x:" >> $GITHUB_ENV + fi + if [[ ${{ steps.auto-bump.outcome }} = "success" ]]; then + echo "AUTO_BUMP_OUTCOME=:white_check_mark:" >> $GITHUB_ENV + else + echo "AUTO_BUMP_OUTCOME=:x:" >> $GITHUB_ENV + fi + - name: Add connector outcome line to table + if: ${{ always() && github.event.inputs.comment-id }} uses: peter-evans/create-or-update-comment@v1 with: comment-id: ${{ github.event.inputs.comment-id }} body: | - > :white_check_mark: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} - - name: Add Failure Comment - if: 
github.event.inputs.comment-id && !success() + | ${{ matrix.connector }} | ${{ env.PUBLISH_OUTCOME }} | ${{ env.AUTO_BUMP_OUTCOME }} | + add-helpful-info-to-git-comment: + if: ${{ always() && github.event.inputs.comment-id }} + name: Add extra info to git comment + needs: + - start-publish-image-runner-0 # required to get output from the start-runner job + - publish-image # required to wait when the main job is done + runs-on: ubuntu-latest + steps: + - name: Add hint for manual seed definition update uses: peter-evans/create-or-update-comment@v1 with: comment-id: ${{ github.event.inputs.comment-id }} body: | - > :x: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} - - name: Slack Notification - Failure - if: failure() - uses: rtCamp/action-slack-notify@master - env: - SLACK_WEBHOOK: ${{ secrets.BUILD_SLACK_WEBHOOK }} - SLACK_USERNAME: Buildozer - SLACK_ICON: https://avatars.slack-edge.com/temp/2020-09-01/1342729352468_209b10acd6ff13a649a1.jpg - SLACK_COLOR: DC143C - SLACK_TITLE: "Failed to publish connector ${{ github.event.inputs.connector }} from branch ${{ github.ref }}" - SLACK_FOOTER: "" +
+ + if you have connectors that successfully published but failed definition generation, follow [step 4 here ▶️](https://docs.airbyte.com/connector-development/#publishing-a-connector) # In case of self-hosted EC2 errors, remove this block. - stop-publish-image-runner: + stop-publish-image-runner-0: + if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs name: Stop Build EC2 Runner needs: - - start-publish-image-runner # required to get output from the start-runner job + - start-publish-image-runner-0 # required to get output from the start-runner job + - preprocess-matrix - publish-image # required to wait when the main job is done + - find_valid_pat + - add-helpful-info-to-git-comment + runs-on: ubuntu-latest + steps: + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + - name: Stop EC2 runner + uses: airbytehq/ec2-github-runner@base64v1.1.0 + with: + mode: stop + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ needs.start-publish-image-runner-0.outputs.label }} + ec2-instance-id: ${{ needs.start-publish-image-runner-0.outputs.ec2-instance-id }} + stop-publish-image-runner-multi: + if: ${{ always() && github.event.inputs.parallel == 'true' }} + name: Stop Build EC2 Runner + needs: + - start-publish-image-runner-0 + - start-publish-image-runner-1 + - start-publish-image-runner-2 + - start-publish-image-runner-3 + - start-publish-image-runner-4 + - preprocess-matrix + - publish-image # required to wait when the main job is done + - find_valid_pat + strategy: + fail-fast: false + matrix: + ec2-instance: + [ + { + "label": "${{ needs.start-publish-image-runner-1.outputs.label }}", + "id": "${{ needs.start-publish-image-runner-1.outputs.ec2-instance-id }}", + }, + { + "label": "${{ needs.start-publish-image-runner-2.outputs.label }}", + "id": "${{ needs.start-publish-image-runner-2.outputs.ec2-instance-id }}", + }, + { + "label": "${{ needs.start-publish-image-runner-3.outputs.label }}", + "id": "${{ needs.start-publish-image-runner-3.outputs.ec2-instance-id }}", + }, + { + "label": "${{ needs.start-publish-image-runner-4.outputs.label }}", + "id": "${{ needs.start-publish-image-runner-4.outputs.ec2-instance-id }}", + }, + ] runs-on: ubuntu-latest - if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v1 @@ -143,9 +451,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2 + uses: airbytehq/ec2-github-runner@base64v1.1.0 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} - label: ${{ needs.start-publish-image-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-publish-image-runner.outputs.ec2-instance-id }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + label: ${{ matrix.ec2-instance.label }} + ec2-instance-id: ${{ matrix.ec2-instance.id }} diff --git a/.github/workflows/publish-connector-command.yml b/.github/workflows/publish-connector-command.yml new file mode 100644 index 0000000000000..4aac116c8dbfb --- /dev/null +++ b/.github/workflows/publish-connector-command.yml @@ -0,0 +1,210 @@ +name: Publish Connector [EXPERIMENTAL] +on: + 
workflow_dispatch: + inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master + connector: + description: "Airbyte Connector" + required: true + bump-version: + description: "Set to major, minor, or patch to automatically bump connectors version in Dockerfile, definitions.yaml and generate seed spec. You can also do this manually" + required: false + default: "false" + run-tests: + description: "Should run tests" + required: false + default: "true" + comment-id: + description: "The comment-id of the slash command. Used to update the comment with the status." + required: false + +jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} + ## Gradle Build + # In case of self-hosted EC2 errors, remove this block. + +# start-bump-build-test-connector-runner: +# name: Start Build EC2 Runner +# runs-on: ubuntu-latest +# needs: find_valid_pat +# outputs: +# label: ${{ steps.start-ec2-runner.outputs.label }} +# ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} +# steps: +# - name: Checkout Airbyte +# uses: actions/checkout@v2 +# with: +# repository: ${{ github.event.inputs.repo }} +# ref: ${{ github.event.inputs.gitref }} +# - name: Start AWS Runner +# id: start-ec2-runner +# uses: ./.github/actions/start-aws-runner +# with: +# aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} +# aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} +# github-token: ${{ needs.find_valid_pat.outputs.pat }} +# # 80 gb disk +# ec2-image-id: ami-0d648081937c75a73 +# bump-build-test-connector: +# needs: start-bump-build-test-connector-runner +# runs-on: ${{ needs.start-bump-build-test-connector-runner.outputs.label }} +# environment: more-secrets +# steps: +# ############################ +# ## SET UP ## +# ############################ +# - name: Set up Cloud SDK +# uses: google-github-actions/setup-gcloud@v0 +# with: +# service_account_key: ${{ secrets.SPEC_CACHE_SERVICE_ACCOUNT_KEY }} +# export_default_credentials: true +# - name: Search for valid connector name format +# id: regex +# uses: AsasInnab/regex-action@v1 +# with: +# regex_pattern: "^(connectors|bases)/[a-zA-Z0-9-_]+$" +# regex_flags: "i" # required to be set for this plugin +# search_string: ${{ github.event.inputs.connector }} +# - name: Validate input workflow format +# if: steps.regex.outputs.first_match != github.event.inputs.connector +# run: echo "The connector provided has an invalid format!" 
&& exit 1 +# - name: Link comment to workflow run +# if: github.event.inputs.comment-id +# uses: peter-evans/create-or-update-comment@v1 +# with: +# comment-id: ${{ github.event.inputs.comment-id }} +# body: | +# > :clock2: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} +# - name: Checkout Airbyte +# uses: actions/checkout@v2 +# with: +# repository: ${{ github.event.inputs.repo }} +# ref: ${{ github.event.inputs.gitref }} +# token: ${{ secrets.OCTAVIA_PAT }} +# - name: Install Java +# uses: actions/setup-java@v1 +# with: +# java-version: "17" +# - name: Install Python +# uses: actions/setup-python@v2 +# with: +# python-version: "3.9" +# - name: Install Pyenv and Tox +# run: | +# python3 -m pip install --quiet virtualenv==16.7.9 --user +# python3 -m virtualenv venv +# source venv/bin/activate +# pip install --quiet tox==3.24.4 +# - name: Install yq +# if: github.event.inputs.bump-version != 'false' && success() +# run: | +# sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys CC86BB64 +# sudo add-apt-repository ppa:rmescandon/yq +# sudo apt update +# sudo apt install yq -y +# - name: Test and install CI scripts +# # all CI python packages have the prefix "ci_" +# run: | +# source venv/bin/activate +# tox -r -c ./tools/tox_ci.ini +# pip install --quiet -e ./tools/ci_* +# - name: Get Credentials for ${{ github.event.inputs.connector }} +# run: | +# source venv/bin/activate +# ci_credentials ${{ github.event.inputs.connector }} +# env: +# GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} +# # TODO: seems like this should run in post-merge workflow +# # - name: Prepare Sentry +# # if: startsWith(github.event.inputs.connector, 'connectors') +# # run: | +# # curl -sL https://sentry.io/get-cli/ | bash +# # - name: Create Sentry Release +# # if: startsWith(github.event.inputs.connector, 'connectors') +# # run: | +# # sentry-cli releases set-commits "${{ env.IMAGE_NAME }}@${{ env.IMAGE_VERSION }}" --auto --ignore-missing +# # env: +# # SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_CONNECTOR_RELEASE_AUTH_TOKEN }} +# # SENTRY_ORG: airbyte-5j +# # SENTRY_PROJECT: airbyte-connectors +# # - name: Build and Test Success Comment +# # if: github.event.inputs.comment-id && success() +# # uses: peter-evans/create-or-update-comment@v1 +# # with: +# # comment-id: ${{ github.event.inputs.comment-id }} +# # body: | +# # > :rocket: Successfully built and tested ${{github.event.inputs.connector}} +# # - name: Build and Test Failure Comment +# # if: github.event.inputs.comment-id && !success() +# # uses: peter-evans/create-or-update-comment@v1 +# # with: +# # comment-id: ${{ github.event.inputs.comment-id }} +# # body: | +# # > :x: Failed to build and test ${{github.event.inputs.connector}} +# # - name: Slack Notification - Failure +# # if: failure() +# # uses: rtCamp/action-slack-notify@master +# # env: +# # SLACK_WEBHOOK: ${{ secrets.BUILD_SLACK_WEBHOOK }} +# # SLACK_USERNAME: Buildozer +# # SLACK_ICON: https://avatars.slack-edge.com/temp/2020-09-01/1342729352468_209b10acd6ff13a649a1.jpg +# # SLACK_COLOR: DC143C +# # SLACK_TITLE: "Failed to build and test connector ${{ github.event.inputs.connector }} from branch ${{ github.ref }}" +# # SLACK_FOOTER: "" +# # - name: Add Final Success Comment +# # if: github.event.inputs.comment-id && success() +# # uses: peter-evans/create-or-update-comment@v1 +# # with: +# # comment-id: ${{ github.event.inputs.comment-id }} +# # body: | +# # > :white_check_mark: ${{github.event.inputs.connector}} 
https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} +# # - name: Set publish label +# # if: success() +# # run: | +# # echo "set some label on PR" +# # In case of self-hosted EC2 errors, remove this block. +# stop-bump-build-test-connector-runner: +# name: Stop Build EC2 Runner +# needs: +# - start-bump-build-test-connector-runner # required to get output from the start-runner job +# - bump-build-test-connector # required to wait when the main job is done +# - find_valid_pat +# runs-on: ubuntu-latest +# if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs +# steps: +# - name: Configure AWS credentials +# uses: aws-actions/configure-aws-credentials@v1 +# with: +# aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} +# aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} +# aws-region: us-east-2 +# - name: Stop EC2 runner +# uses: supertopher/ec2-github-runner@base64v1.0.10 +# with: +# mode: stop +# github-token: ${{ needs.find_valid_pat.outputs.pat }} +# label: ${{ needs.start-bump-build-test-connector-runner.outputs.label }} +# ec2-instance-id: ${{ needs.start-bump-build-test-connector-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/publish-external-command.yml b/.github/workflows/publish-external-command.yml index 2ed44926a7182..ff85f1c147b29 100644 --- a/.github/workflows/publish-external-command.yml +++ b/.github/workflows/publish-external-command.yml @@ -11,12 +11,38 @@ on: comment-id: description: "The comment-id of the slash command. Used to update the comment with the status." required: false + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} ## Gradle Build # In case of self-hosted EC2 errors, remove this block. 
start-publish-image-runner: name: Start Build EC2 Runner + needs: find_valid_pat runs-on: ubuntu-latest outputs: label: ${{ steps.start-ec2-runner.outputs.label }} ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} @@ -25,14 +51,15 @@ jobs: - name: Checkout Airbyte uses: actions/checkout@v2 with: - repository: ${{github.event.pull_request.head.repo.full_name}} # always use the branch's repository + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Start AWS Runner id: start-ec2-runner uses: ./.github/actions/start-aws-runner with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} # 80 gb disk ec2-image-id: ami-0d648081937c75a73 publish-image: @@ -41,7 +68,7 @@ jobs: environment: more-secrets steps: - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@master + uses: google-github-actions/setup-gcloud@v0 with: service_account_key: ${{ secrets.SPEC_CACHE_SERVICE_ACCOUNT_KEY }} export_default_credentials: true @@ -55,14 +82,16 @@ - name: Checkout Airbyte uses: actions/checkout@v2 with: - repository: ${{github.event.pull_request.head.repo.full_name}} # always use the branch's repository + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - run: | - echo "$SPEC_CACHE_SERVICE_ACCOUNT_KEY" > spec_cache_key_file.json && docker login -u airbytebot -p ${DOCKER_PASSWORD} + echo "$SPEC_CACHE_SERVICE_ACCOUNT_KEY" > spec_cache_key_file.json && docker login -u ${DOCKER_HUB_USERNAME} -p ${DOCKER_HUB_PASSWORD} ./tools/integrations/manage.sh publish_external ${{ github.event.inputs.connector }} ${{ github.event.inputs.version }} name: publish ${{ github.event.inputs.connector }} id: publish env: - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} # Oracle expects this variable to be set. Although usually present, this is not set by default on Github virtual runners. 
TZ: UTC - name: Add Success Comment @@ -95,6 +124,7 @@ jobs: needs: - start-publish-image-runner # required to get output from the start-runner job - publish-image # required to wait when the main job is done + - find_valid_pat runs-on: ubuntu-latest if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: @@ -105,9 +135,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2 + uses: supertopher/ec2-github-runner@base64v1.0.10 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} label: ${{ needs.start-publish-image-runner.outputs.label }} ec2-instance-id: ${{ needs.start-publish-image-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/publish-helm-charts.yml b/.github/workflows/publish-helm-charts.yml new file mode 100644 index 0000000000000..f2b03415c5637 --- /dev/null +++ b/.github/workflows/publish-helm-charts.yml @@ -0,0 +1,91 @@ +name: Publish Helm OSS charts +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + +on: + push: + branches: + - 'master' + paths: + - 'charts/**' + workflow_dispatch: + +jobs: + generate-semantic-version: + name: Generate semantic version + runs-on: ubuntu-22.04 + outputs: + next-version: ${{ steps.sem-ver.outputs.version }} + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - uses: paulhatch/semantic-version@v4.0.3 + id: sem-ver + with: + tag_prefix: "v" + major_pattern: "feat:" + minor_pattern: "fix:" + format: "${major}.${minor}.${patch}" + change_path: "./charts" + bump_each_commit: true + + release-chart: + name: Chart release + runs-on: ubuntu-22.04 + needs: ["generate-semantic-version"] + steps: + - uses: actions/checkout@v3 + with: + path: "airbyte" + fetch-depth: 0 + + - uses: actions/checkout@v3 + with: + repository: "airbytehq/helm-charts" + token: ${{ secrets.OCTAVIA_PAT }} + path: "airbyte-oss" + + - name: Replace semantic version in main chart for deps + shell: bash + working-directory: ./airbyte/charts + run: | + sed -i "s/ version: placeholder/ version: ${{ needs.generate-semantic-version.outputs.next-version }}/g" airbyte/Chart.yaml + + - name: "Helm package" + shell: bash + run: | + declare -a StringArray=("airbyte-bootloader" "airbyte-server" "airbyte-temporal" "airbyte-webapp" "airbyte-pod-sweeper" "airbyte-worker") + for val in ${StringArray[@]}; do + cd ./airbyte/charts/${val} && helm dep update && cd $GITHUB_WORKSPACE + helm package ./airbyte/charts/${val} -d airbyte-oss --version ${{ needs.generate-semantic-version.outputs.next-version }} + done + helm repo index airbyte-oss/ + + - name: Commit and push changes + uses: EndBug/add-and-commit@v9 + with: + message: 'Bump release to ${{ needs.generate-semantic-version.outputs.next-version }}' + add: '.' 
+ cwd: './airbyte-oss/' + + - name: "Helm package main chart" + shell: bash + run: | + echo "Waiting for published charts to be synced in helm-charts repo" + sleep 300 + declare -a StringArray=("airbyte") + for val in ${StringArray[@]}; do + cd ./airbyte/charts/${val} && cat Chart.yaml && helm dep update && cd $GITHUB_WORKSPACE + helm package ./airbyte/charts/${val} -d airbyte-oss --version ${{ needs.generate-semantic-version.outputs.next-version }} + done + helm repo index airbyte-oss/ + + - name: Commit and push changes + uses: EndBug/add-and-commit@v9 + with: + message: 'Bump release to ${{ needs.generate-semantic-version.outputs.next-version }}' + add: '.' + cwd: './airbyte-oss/' + diff --git a/.github/workflows/publish-oss-for-cloud.yml b/.github/workflows/publish-oss-for-cloud.yml new file mode 100644 index 0000000000000..e7acb72f43120 --- /dev/null +++ b/.github/workflows/publish-oss-for-cloud.yml @@ -0,0 +1,145 @@ +name: Publish OSS Artifacts for Cloud +concurrency: + group: ${{ github.workflow }}-${{ inputs.oss_ref || github.sha }} + +on: + workflow_dispatch: + inputs: + oss_ref: + description: "Publish artifacts for the following git ref (if unspecified, uses the latest commit for the current branch):" + required: false +jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} + start-runner: + name: "Start Runner on AWS" + needs: find_valid_pat + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Start AWS Runner + id: start-ec2-runner + uses: ./.github/actions/start-aws-runner + with: + aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} + + generate-tags: + name: "Generate Dev and Master Tags" + runs-on: ubuntu-latest + outputs: + dev_tag: ${{ steps.set-outputs.outputs.dev_tag }} + master_tag: ${{ steps.set-outputs.outputs.master_tag }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + ref: ${{ github.event.inputs.oss_ref || github.ref }} + - name: Generate Outputs + id: set-outputs + shell: bash + run: |- + set -x + + commit_sha=$(git rev-parse --short HEAD) + + # set dev_tag + # AirbyteVersion.java allows versions that have a prefix of 'dev' + echo "::set-output name=dev_tag::dev-${commit_sha}" + + # If this commit is on the master branch, also set master_tag + if $(git merge-base --is-ancestor "${commit_sha}" master); then + echo "::set-output name=master_tag::${commit_sha}" + fi + + oss-branch-build: + name: "Gradle Build and Publish" + needs: + - start-runner + - generate-tags + runs-on: ${{ needs.start-runner.outputs.label }} + environment: more-secrets + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + ref: ${{ github.event.inputs.oss_ref || github.ref }} + + - name: Build Branch + uses: ./.github/actions/build-branch + with: + branch_version_tag: ${{ 
needs.generate-tags.outputs.dev_tag }} + + - name: Publish Dev Jars + env: + CLOUDREPO_USER: ${{ secrets.CLOUDREPO_USER }} + CLOUDREPO_PASSWORD: ${{ secrets.CLOUDREPO_PASSWORD }} + run: VERSION=${{ needs.generate-tags.outputs.dev_tag }} SUB_BUILD=PLATFORM ./gradlew publish + shell: bash + + - name: Publish Master Jars + if: needs.generate-tags.outputs.master_tag != '' + env: + CLOUDREPO_USER: ${{ secrets.CLOUDREPO_USER }} + CLOUDREPO_PASSWORD: ${{ secrets.CLOUDREPO_PASSWORD }} + run: VERSION=${{ needs.generate-tags.outputs.master_tag }} SUB_BUILD=PLATFORM ./gradlew publish + shell: bash + + docker-push: + name: "Push Docker Images" + needs: + - start-runner + - generate-tags + - oss-branch-build + runs-on: ${{ needs.start-runner.outputs.label }} + steps: + - name: Login to Docker (on Master) + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_HUB_USERNAME }} + password: ${{ secrets.DOCKER_HUB_PASSWORD }} + + - name: Prepare Docker buildx + run: | + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + docker buildx create --name oss-buildx --driver docker-container --use + shell: bash + + - name: Set Git Revision + run: | + GIT_REVISION=$(git rev-parse HEAD) + [[ -z "$GIT_REVISION" ]] && echo "Couldn't get the git revision..." && exit 1 + echo "GIT_REVISION=${GIT_REVISION}" >> $GITHUB_ENV + shell: bash + + - name: Push Docker Images + env: + VERSION: ${{ needs.generate-tags.outputs.dev_tag }} + ALT_TAG: ${{ needs.generate-tags.outputs.master_tag }} + run: GIT_REVISION=$GIT_REVISION docker buildx bake -f docker-compose-cloud.buildx.yaml --push + shell: bash + + - name: Cleanup Docker buildx + run: docker buildx rm oss-buildx + shell: bash diff --git a/.github/workflows/release-airbyte-os.yml b/.github/workflows/release-airbyte-os.yml index c7ccc2d7c6ece..14a284cbfe523 100644 --- a/.github/workflows/release-airbyte-os.yml +++ b/.github/workflows/release-airbyte-os.yml @@ -9,9 +9,27 @@ on: required: true default: "patch" jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} # In case of self-hosted EC2 errors, remove this block. start-release-airbyte-runner: name: "Release Airbyte: Start EC2 Runner" + needs: find_valid_pat timeout-minutes: 10 runs-on: ubuntu-latest outputs: @@ -26,7 +44,7 @@ jobs: with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} releaseAirbyte: # In case of self-hosted EC2 errors, removed the `needs` line and switch back to running on ubuntu-latest. 
@@ -42,35 +60,78 @@ jobs: with: java-version: "17" - - uses: actions/setup-node@v1 + - uses: actions/setup-node@v2 with: - node-version: "16.13.0" - + node-version: "lts/gallium" # necessary to install pip - uses: actions/setup-python@v2 with: - python-version: "3.7" - - name: Save Old Version - id: old_version - run: | - echo ::set-output name=OLD_VERSION::$(grep -w VERSION .env | cut -d"=" -f2) + python-version: "3.9" - name: Release Airbyte id: release_airbyte env: - DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} PART_TO_BUMP: ${{ github.event.inputs.partToBump }} CLOUDREPO_USER: ${{ secrets.CLOUDREPO_USER }} CLOUDREPO_PASSWORD: ${{ secrets.CLOUDREPO_PASSWORD }} run: | ./tools/bin/release_version.sh - - name: Save New Version - id: new_version - run: | - echo ::set-output name=NEW_VERSION::$(grep -w VERSION .env | cut -d"=" -f2) + + # We are releasing octavia from a separate job because: + # - The self hosted runner used in releaseAirbyte does not have the docker buildx command to build multi-arch images + releaseOctavia: + runs-on: ubuntu-latest + environment: more-secrets + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - uses: actions/setup-java@v1 + with: + java-version: "17" + + - uses: actions/setup-node@v2 + with: + node-version: "lts/gallium" + - uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: Release Octavia + id: release_octavia + env: + DOCKER_HUB_USERNAME: ${{ secrets.DOCKER_HUB_USERNAME }} + DOCKER_HUB_PASSWORD: ${{ secrets.DOCKER_HUB_PASSWORD }} + PART_TO_BUMP: ${{ github.event.inputs.partToBump }} + run: ./tools/bin/release_version_octavia.sh + + createPullRequest: + needs: + - releaseAirbyte + - releaseOctavia + runs-on: ubuntu-latest + environment: more-secrets + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + # necessary to install pip + - uses: actions/setup-python@v2 + with: + python-version: "3.9" + - name: Bump version + id: bump_version + env: + PART_TO_BUMP: ${{ github.event.inputs.partToBump }} + run: ./tools/bin/bump_version.sh - name: Get PR Body id: pr_body env: - PREV_VERSION: ${{ steps.old_version.outputs.OLD_VERSION }} + PREV_VERSION: ${{ steps.bump_version.outputs.PREV_VERSION }} + NEW_VERSION: ${{ steps.bump_version.outputs.NEW_VERSION }} + GIT_REVISION: ${{ steps.bump_version.outputs.GIT_REVISION }} run: | chmod +x tools/bin/pr_body.sh body=$(./tools/bin/pr_body.sh) @@ -84,9 +145,9 @@ jobs: branch: bump-version branch-suffix: random delete-branch: true - title: Bump Airbyte version from ${{ steps.old_version.outputs.OLD_VERSION }} to ${{ steps.new_version.outputs.NEW_VERSION }} + title: Bump Airbyte version from ${{ steps.bump_version.outputs.PREV_VERSION }} to ${{ steps.bump_version.outputs.NEW_VERSION }} body: ${{ steps.pr_body.outputs.PR_BODY }} - commit-message: Bump Airbyte version from ${{ steps.old_version.outputs.OLD_VERSION }} to ${{ steps.new_version.outputs.NEW_VERSION }} + commit-message: Bump Airbyte version from ${{ steps.bump_version.outputs.PREV_VERSION }} to ${{ steps.bump_version.outputs.NEW_VERSION }} - name: PR Details run: | echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" @@ -99,6 +160,7 @@ jobs: needs: - start-release-airbyte-runner # required to get output from the start-runner job - releaseAirbyte # required to wait when the main job is done + - find_valid_pat runs-on: ubuntu-latest if: ${{ always() 
}} # required to stop the runner even if the error happened in the previous jobs steps: @@ -109,9 +171,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2.3.0 + uses: supertopher/ec2-github-runner@base64v1.0.10 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} label: ${{ needs.start-release-airbyte-runner.outputs.label }} ec2-instance-id: ${{ needs.start-release-airbyte-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/run-specific-test-command.yml b/.github/workflows/run-specific-test-command.yml new file mode 100644 index 0000000000000..6d43e57a7cd5c --- /dev/null +++ b/.github/workflows/run-specific-test-command.yml @@ -0,0 +1,59 @@ +name: performance-test +on: + workflow_dispatch: + inputs: + repo: + description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." + required: false + default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master + test-name: + description: "Test to run classname" + required: true + +jobs: + single-test-runner: + timeout-minutes: 300 + needs: start-platform-build-runner # required to start the main job when the runner is ready + runs-on: ${{ needs.start-platform-build-runner.outputs.label }} # run the job on the newly created runner + environment: more-secrets + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + with: + repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} + + - name: Npm Caching + uses: actions/cache@v2 + with: + path: | + ~/.npm + key: ${{ secrets.CACHE_VERSION }}-npm-${{ runner.os }}-${{ hashFiles('**/package-lock.json') }} + restore-keys: | + ${{ secrets.CACHE_VERSION }}-npm-${{ runner.os }}- + + # this intentionally does not use restore-keys so we don't mess with gradle caching + - name: Gradle Caching + uses: actions/cache@v2 + with: + path: | + ~/.gradle/caches + ~/.gradle/wrapper + **/.venv + key: ${{ secrets.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('**/*.gradle*') }}-${{ hashFiles('**/package-lock.json') }} + + - uses: actions/setup-java@v1 + with: + java-version: '14' + + - uses: actions/setup-node@v2 + with: + node-version: 'lts/gallium' + + - name: Build + id: run-specific-test + run: ./gradlew allTest --tests *${{ github.event.inputs.test-name }} diff --git a/.github/workflows/run-specific-test.yml b/.github/workflows/run-specific-test.yml deleted file mode 100644 index ad31c77ea549f..0000000000000 --- a/.github/workflows/run-specific-test.yml +++ /dev/null @@ -1,48 +0,0 @@ -name: performance-test -on: - workflow_dispatch: - inputs: - test-name: - description: "Test to run classname" - required: true - -jobs: - single-test-runner: - timeout-minutes: 300 - needs: start-platform-build-runner # required to start the main job when the runner is ready - runs-on: ${{ needs.start-platform-build-runner.outputs.label }} # run the job on the newly created runner - environment: more-secrets - steps: - - name: Checkout Airbyte - uses: actions/checkout@v2 - - - name: Npm Caching - uses: actions/cache@v2 - with: - path: | - ~/.npm - key: ${{ secrets.CACHE_VERSION }}-npm-${{ runner.os }}-${{ hashFiles('**/package-lock.json') }} - restore-keys: | - ${{ secrets.CACHE_VERSION }}-npm-${{ runner.os }}- - - # this intentionally does 
not use restore-keys so we don't mess with gradle caching - - name: Gradle Caching - uses: actions/cache@v2 - with: - path: | - ~/.gradle/caches - ~/.gradle/wrapper - **/.venv - key: ${{ secrets.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('**/*.gradle*') }}-${{ hashFiles('**/package-lock.json') }} - - - uses: actions/setup-java@v1 - with: - java-version: '14' - - - uses: actions/setup-node@v1 - with: - node-version: '16.13.0' - - - name: Build - id: run-specific-test - run: ./gradlew allTest --tests *${{ github.event.inputs.test-name }} diff --git a/.github/workflows/shared-issues.yml b/.github/workflows/shared-issues.yml new file mode 100644 index 0000000000000..03bbb8eb0554f --- /dev/null +++ b/.github/workflows/shared-issues.yml @@ -0,0 +1,16 @@ +name: "Shared Issues" +on: + issues: + types: [opened, labeled, unlabeled] + +jobs: + shared-issues: + runs-on: ubuntu-latest + steps: + - uses: nick-fields/private-action-loader@v3 + with: + pal-repo-token: "${{ secrets.OCTAVIA_PAT }}" + pal-repo-name: airbytehq/workflow-actions@production + # the following input gets passed to the private action + token: "${{ secrets.OCTAVIA_PAT }}" + command: "issue" diff --git a/.github/workflows/shared-pulls.yml b/.github/workflows/shared-pulls.yml new file mode 100644 index 0000000000000..df9c11d6b49ac --- /dev/null +++ b/.github/workflows/shared-pulls.yml @@ -0,0 +1,38 @@ +name: "Shared Pull Requests" +on: + pull_request_target: + types: [opened, labeled, unlabeled, ready_for_review, synchronize, reopened] + +jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.OCTAVIA_PAT }} \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} + + shared-pulls: + name: "Label github issues for tracking" + needs: + - find_valid_pat + runs-on: ubuntu-latest + steps: + - uses: nick-fields/private-action-loader@v3 + with: + pal-repo-token: ${{ needs.find_valid_pat.outputs.pat }} + pal-repo-name: airbytehq/workflow-actions@production + # the following input gets passed to the private action + token: ${{ needs.find_valid_pat.outputs.pat }} + command: "pull" diff --git a/.github/workflows/slash-commands.yml b/.github/workflows/slash-commands.yml index 7a11ce6a5de3b..1c2aed0b941e1 100644 --- a/.github/workflows/slash-commands.yml +++ b/.github/workflows/slash-commands.yml @@ -4,27 +4,34 @@ on: types: [created] jobs: slashCommandDispatch: + # Only allow slash commands on pull request (not on issues) + if: ${{ github.event.issue.pull_request }} runs-on: ubuntu-latest steps: - - name: Get PR Ref + - name: Get PR repo and ref id: getref run: | - echo "::set-output name=ref::$(curl ${{ github.event.issue.pull_request.url }} | jq -r '.head.ref')" + pr_info="$(curl ${{ github.event.issue.pull_request.url }})" + echo ::set-output name=ref::"$(echo $pr_info | jq -r '.head.ref')" + echo ::set-output name=repo::"$(echo $pr_info | jq -r '.head.repo.full_name')" - name: Slash Command Dispatch id: scd uses: peter-evans/slash-command-dispatch@v2 with: - token: ${{ secrets.SLASH_COMMAND_PAT }} + token: ${{ secrets.DAVINCHIA_PAT }} commands: | test test-performance + build-connector + publish-connector publish publish-external publish-cdk gke-kube-test 
run-specific-test static-args: | - ref=${{ steps.getref.outputs.ref }} + repo=${{ steps.getref.outputs.repo }} + gitref=${{ steps.getref.outputs.ref }} comment-id=${{ github.event.comment.id }} dispatch-type: workflow - name: Edit comment with error message diff --git a/.github/workflows/sonar-scan.yml b/.github/workflows/sonar-scan.yml index 7c1f40ec216cf..1320d7ac21d21 100644 --- a/.github/workflows/sonar-scan.yml +++ b/.github/workflows/sonar-scan.yml @@ -4,7 +4,6 @@ on: types: [opened, synchronize, closed] jobs: - detect-changes: name: Detect Changed Modules timeout-minutes: 5 @@ -23,11 +22,11 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.7 - - name: Intall Requirements + python-version: 3.9 + - name: Install Requirements run: | - pip install ./tools/ci_common_utils ./tools/ci_code_validator[tests] - pytest ./tools/ci_code_validator + pip install ./tools/ci_common_utils ./tools/ci_code_validator[tests] + pytest ./tools/ci_code_validator - name: Detect Changed Modules id: detect-changed-modules run: | @@ -36,9 +35,10 @@ jobs: echo "::set-output name=changed-modules::{ \"include\": $CHANGES }" run-ci-tests: - if: github.event.pull_request.draft == false + # Do only run if the PR is not a draft and the changed modules matrix contains at least one entry + if: github.event.pull_request.draft == false && fromJson(needs.detect-changes.outputs.changed-modules).include[0] != null needs: detect-changes - name: Tests for ${{ matrix.module }} + name: Tests for ${{ matrix.module }} runs-on: ubuntu-latest strategy: @@ -49,29 +49,25 @@ jobs: MODULE_FOLDER: ${{ matrix.folder }} ENV_NAME: "github" - steps: - - name: Print Settings - run: | - echo "Module: ${{ env.MODULE_NAME }}, Lang: ${{ env.MODULE_LANG }}, Folder: ${{ env.MODULE_FOLDER }}" - - name: Checkout Airbyte - if: ${{ env.ENV_NAME == 'github' }} - uses: actions/checkout@v2 - with: - fetch-depth: 0 - - - name: Run Tests Runner - id: run-python-tests - uses: ./.github/actions/ci-tests-runner - with: - token: ${{ secrets.GITHUB_TOKEN }} - module-name: ${{ env.MODULE_NAME }} - module-folder: ${{ env.MODULE_FOLDER }} - module-lang: ${{ env.MODULE_LANG }} - sonar-token: ${{ secrets.SONAR_TOKEN }} - sonar-gcp-access-key: ${{ secrets.GCP_SONAR_SA_KEY }} - pull-request-id: "${{ github.repository }}/${{ github.event.pull_request.number }}" - remove-sonar-project: ${{ github.event_name == 'pull_request' && github.event.action == 'closed' }} - - + - name: Print Settings + run: | + echo "Module: ${{ env.MODULE_NAME }}, Lang: ${{ env.MODULE_LANG }}, Folder: ${{ env.MODULE_FOLDER }}" + - name: Checkout Airbyte + if: ${{ env.ENV_NAME == 'github' }} + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: Run Tests Runner + id: run-python-tests + uses: ./.github/actions/ci-tests-runner + with: + token: ${{ secrets.GITHUB_TOKEN }} + module-name: ${{ env.MODULE_NAME }} + module-folder: ${{ env.MODULE_FOLDER }} + module-lang: ${{ env.MODULE_LANG }} + sonar-token: ${{ secrets.SONAR_TOKEN }} + sonar-gcp-access-key: ${{ secrets.GCP_SONAR_SA_KEY }} + pull-request-id: "${{ github.repository }}/${{ github.event.pull_request.number }}" + remove-sonar-project: true diff --git a/.github/workflows/sync-branches.yml b/.github/workflows/sync-branches.yml deleted file mode 100644 index 65aad5bc4f576..0000000000000 --- a/.github/workflows/sync-branches.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Sync Docs Branch -on: - push: - branches: - - master - -jobs: - repo-sync: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 
- with: - persist-credentials: false - - name: repo-sync - uses: repo-sync/github-sync@v2 - with: - source_repo: "airbytehq/airbyte" - source_branch: "master" - destination_branch: "gitbook/v1" - github_token: ${{ secrets.SLASH_COMMAND_PAT }} diff --git a/.github/workflows/terminate-zombie-build-instances.yml b/.github/workflows/terminate-zombie-build-instances.yml new file mode 100644 index 0000000000000..8de735476c813 --- /dev/null +++ b/.github/workflows/terminate-zombie-build-instances.yml @@ -0,0 +1,48 @@ +# Required since we cannot guarantee instances are always terminated. +# Also a failsafe against a dev writing a workflow that does not terminate build instances. +# Though the average Airbyte build runtime as of this commit is ~20 mins, connector builds +# can take up to 3 hours. Set this to 3 hours to include these longer runs. +name: Terminate Zombie Build Instances + +on: + workflow_dispatch: + schedule: + - cron: "0 */1 * * *" + +jobs: + terminate: + runs-on: ubuntu-latest + steps: + - name: List and Terminate Instances Older Than 4 Hours + env: + AWS_ACCESS_KEY_ID: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} + # See https://github.com/aws/aws-cli/issues/5623 + AWS_EC2_METADATA_DISABLED: true + run: | + set -euxo pipefail + + export TIME_LIMIT=14400 # 4 hours + + aws configure set default.region us-east-2 + + # See https://awscli.amazonaws.com/v2/documentation/api/latest/reference/ec2/describe-instances.html for describe command. + # Since the AWS cli returns an ISO HH:MM timestamp, and Jq only accepts Z timestamps, we define a function toZ to convert this. + export to_terminate=$(aws ec2 describe-instances --no-paginate --filters Name=instance-type,Values=c5.2xlarge Name=instance-state-name,Values=running \ + --query 'Reservations[*].Instances[*].{Instance:InstanceId,LaunchTime:LaunchTime}' --output json \ + | jq 'def toZ(str): str | (split("+")[0] + "Z") | fromdate ; + flatten | map( { InstanceId: .Instance, LaunchTime: toZ(.LaunchTime) } ) | map( select ( .LaunchTime < (now - (env.TIME_LIMIT|tonumber)) ) )') + + echo "MARKED FOR TERMINATION: ${to_terminate}" + + # See https://docs.aws.amazon.com/cli/latest/reference/ec2/terminate-instances.html for terminate command. + echo $to_terminate | jq '.[] | .InstanceId' | xargs --no-run-if-empty --max-args=1 aws ec2 terminate-instances --instance-ids + terminate-github-instances: + runs-on: ubuntu-latest + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: List and Terminate GH actions in status 'offline' + env: + GITHUB_PAT: ${{ secrets.OCTAVIA_PAT }} + run: ./tools/bin/gh_action_zombie_killer diff --git a/.github/workflows/test-command.yml b/.github/workflows/test-command.yml index 363e72851248d..ca948abfc0fa1 100644 --- a/.github/workflows/test-command.yml +++ b/.github/workflows/test-command.yml @@ -9,13 +9,40 @@ on: description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." required: false default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master comment-id: description: "The comment-id of the slash command. Used to update the comment with the status." required: false + uuid: + description: "Custom UUID of workflow run. Used because GitHub dispatches endpoint does not return workflow run id." 
+ required: false jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: UUID ${{ github.event.inputs.uuid }} + run: true + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} start-test-runner: name: Start Build EC2 Runner + needs: find_valid_pat timeout-minutes: 10 runs-on: ubuntu-latest outputs: @@ -26,13 +53,14 @@ jobs: uses: actions/checkout@v2 with: repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Start AWS Runner id: start-ec2-runner uses: ./.github/actions/start-aws-runner with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} # 80 gb disk ec2-image-id: ami-0d648081937c75a73 integration-test: @@ -41,6 +69,13 @@ jobs: runs-on: ${{ needs.start-test-runner.outputs.label }} environment: more-secrets steps: + - name: Link comment to workflow run + if: github.event.inputs.comment-id + uses: peter-evans/create-or-update-comment@v1 + with: + comment-id: ${{ github.event.inputs.comment-id }} + body: | + > :clock2: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} - name: Search for valid connector name format id: regex uses: AsasInnab/regex-action@v1 @@ -51,27 +86,20 @@ jobs: - name: Validate input workflow format if: steps.regex.outputs.first_match != github.event.inputs.connector run: echo "The connector provided has an invalid format!" && exit 1 - - name: Link comment to workflow run - if: github.event.inputs.comment-id - uses: peter-evans/create-or-update-comment@v1 - with: - comment-id: ${{ github.event.inputs.comment-id }} - body: | - > :clock2: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} - name: Checkout Airbyte uses: actions/checkout@v2 with: repository: ${{ github.event.inputs.repo }} - - name: Install Unzip for Databricks - if: github.event.inputs.connector == 'connectors/destination-databricks' - run: | - apt-get update && apt-get install -y unzip + ref: ${{ github.event.inputs.gitref }} - name: Install Java uses: actions/setup-java@v1 with: - java-version: '17' + java-version: "17" + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: "3.9" - name: Install Pyenv and Tox - # Beside PyEnv, this does not set any runtimes up because it uses an AMI image that has everything pre-installed. 
See https://github.com/airbytehq/airbyte/issues/4559/ run: | python3 -m pip install --quiet virtualenv==16.7.9 --user python3 -m virtualenv venv @@ -87,10 +115,16 @@ jobs: run: | source venv/bin/activate ci_credentials ${{ github.event.inputs.connector }} + # normalization also runs destination-specific tests, so fetch their creds also + if [ 'bases/base-normalization' = "${{ github.event.inputs.connector }}" ] || [ 'base-normalization' = "${{ github.event.inputs.connector }}" ]; then + ci_credentials destination-bigquery + ci_credentials destination-postgres + ci_credentials destination-snowflake + fi env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - - name: test ${{ github.event.inputs.connector }} + - name: Test ${{ github.event.inputs.connector }} run: | ./tools/bin/ci_integration_test.sh ${{ github.event.inputs.connector }} id: test @@ -136,7 +170,7 @@ jobs: body: | > :white_check_mark: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} ${{env.PYTHON_UNITTEST_COVERAGE_REPORT}} - > ${{env.PYTHON_SHORT_TEST_SUMMARY_INFO}} + > ${{env.TEST_SUMMARY_INFO}} - name: Add Failure Comment if: github.event.inputs.comment-id && failure() uses: peter-evans/create-or-update-comment@v1 @@ -145,7 +179,7 @@ jobs: body: | > :x: ${{github.event.inputs.connector}} https://github.com/${{github.repository}}/actions/runs/${{github.run_id}} > :bug: ${{env.GRADLE_SCAN_LINK}} - > ${{env.PYTHON_SHORT_TEST_SUMMARY_INFO}} + > ${{env.TEST_SUMMARY_INFO}} # In case of self-hosted EC2 errors, remove this block. stop-test-runner: name: Stop Build EC2 Runner @@ -153,6 +187,7 @@ jobs: needs: - start-test-runner # required to get output from the start-runner job - integration-test # required to wait when the main job is done + - find_valid_pat runs-on: ubuntu-latest if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: @@ -163,9 +198,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2 + uses: supertopher/ec2-github-runner@base64v1.0.10 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} label: ${{ needs.start-test-runner.outputs.label }} ec2-instance-id: ${{ needs.start-test-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/test-performance-command.yml b/.github/workflows/test-performance-command.yml index a3e0dfbce070c..3cc805a8d30b7 100644 --- a/.github/workflows/test-performance-command.yml +++ b/.github/workflows/test-performance-command.yml @@ -9,6 +9,10 @@ on: description: "Repo to check out code from. Defaults to the main airbyte repo. Set this when building connectors from forked repos." required: false default: "airbytehq/airbyte" + gitref: + description: "The git ref to check out from the specified repository." + required: false + default: master comment-id: description: "The comment-id of the slash command. Used to update the comment with the status." 
required: false @@ -20,8 +24,26 @@ on: required: false jobs: + find_valid_pat: + name: "Find a PAT with room for actions" + timeout-minutes: 10 + runs-on: ubuntu-latest + outputs: + pat: ${{ steps.variables.outputs.pat }} + steps: + - name: Checkout Airbyte + uses: actions/checkout@v2 + - name: Check PAT rate limits + id: variables + run: | + ./tools/bin/find_non_rate_limited_PAT \ + ${{ secrets.AIRBYTEIO_PAT }} \ + ${{ secrets.OSS_BUILD_RUNNER_GITHUB_PAT }} \ + ${{ secrets.SUPERTOPHER_PAT }} \ + ${{ secrets.DAVINCHIA_PAT }} start-test-runner: name: Start Build EC2 Runner + needs: find_valid_pat timeout-minutes: 10 runs-on: ubuntu-latest outputs: @@ -32,13 +54,14 @@ jobs: uses: actions/checkout@v2 with: repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Start AWS Runner id: start-ec2-runner uses: ./.github/actions/start-aws-runner with: aws-access-key-id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} # 80 gb disk ec2-image-id: ami-0d648081937c75a73 performance-test: @@ -68,12 +91,16 @@ jobs: uses: actions/checkout@v2 with: repository: ${{ github.event.inputs.repo }} + ref: ${{ github.event.inputs.gitref }} - name: Install Java uses: actions/setup-java@v1 with: - java-version: '17' + java-version: "17" + - name: Install Python + uses: actions/setup-python@v2 + with: + python-version: "3.9" - name: Install Pyenv and Tox - # Beside PyEnv, this does not set any runtimes up because it uses an AMI image that has everything pre-installed. See https://github.com/airbytehq/airbyte/issues/4559/ run: | python3 -m pip install --quiet virtualenv==16.7.9 --user python3 -m virtualenv venv @@ -89,6 +116,12 @@ jobs: run: | source venv/bin/activate ci_credentials ${{ github.event.inputs.connector }} + # normalization also runs destination-specific tests, so fetch their creds also + if [ 'bases/base-normalization' = "${{ github.event.inputs.connector }}" ] || [ 'base-normalization' = "${{ github.event.inputs.connector }}" ]; then + ci_credentials destination-bigquery + ci_credentials destination-postgres + ci_credentials destination-snowflake + fi env: GCP_GSM_CREDENTIALS: ${{ secrets.GCP_GSM_CREDENTIALS }} - run: | @@ -152,6 +185,7 @@ jobs: needs: - start-test-runner # required to get output from the start-runner job - performance-test # required to wait when the main job is done + - find_valid_pat runs-on: ubuntu-latest if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs steps: @@ -162,9 +196,9 @@ jobs: aws-secret-access-key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }} aws-region: us-east-2 - name: Stop EC2 runner - uses: machulav/ec2-github-runner@v2 + uses: supertopher/ec2-github-runner@base64v1.0.10 with: mode: stop - github-token: ${{ secrets.SELF_RUNNER_GITHUB_ACCESS_TOKEN }} + github-token: ${{ needs.find_valid_pat.outputs.pat }} label: ${{ needs.start-test-runner.outputs.label }} ec2-instance-id: ${{ needs.start-test-runner.outputs.ec2-instance-id }} diff --git a/.gitignore b/.gitignore index 9737f1231509e..4027224bf7c8f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ .gradle .idea -.vscode *.iml *.swp build @@ -11,9 +10,9 @@ data .classpath .project .settings +.vscode **/gmon.out static_checker_reports/ -.vscode # Logs acceptance_tests_logs/ @@ -65,3 +64,6 @@ resources/examples/airflow/logs/* # Cloud Demo 
!airbyte-webapp/src/packages/cloud/data + +# Summary.md keeps getting added and we just don't like it +docs/SUMMARY.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e62b52145420f..3487659101c89 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,15 +7,22 @@ repos: hooks: - id: licenseheaders args: ["--tmpl=LICENSE_SHORT", "--ext=py", "-f"] - - repo: https://github.com/ambv/black - rev: 21.11b1 + - repo: https://github.com/psf/black + rev: 22.3.0 hooks: - id: black + args: ["--config", "pyproject.toml"] - repo: https://github.com/timothycrosley/isort rev: 5.10.1 hooks: - id: isort - args: ["--dont-follow-links", "--jobs=-1"] + args: + [ + "--settings-file", + "pyproject.toml", + "--dont-follow-links", + "--jobs=-1", + ] additional_dependencies: ["colorama"] - repo: https://github.com/pre-commit/mirrors-prettier rev: v2.5.0 @@ -34,12 +41,14 @@ repos: rev: v0.0.1a2.post1 hooks: - id: pyproject-flake8 + args: ["--config", "pyproject.toml"] additional_dependencies: ["mccabe"] alias: flake8 - repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.910-1 + rev: v0.930 hooks: - id: mypy + args: ["--config-file", "pyproject.toml"] exclude: | (?x)^.*( octavia-cli/unit_tests/| diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000000000..8193c5583a6f6 --- /dev/null +++ b/.prettierignore @@ -0,0 +1 @@ +airbyte-integrations/bases/base-normalization/integration_tests/normalization_test_output diff --git a/.python-version b/.python-version index c77a7de85cc88..a9f8d1be337f7 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.7.9 +3.9.11 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 73602b5138e03..5fb58f45882df 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,6 +5,11 @@ # Required version: 2 +build: + os: ubuntu-20.04 + tools: + python: "3.9" + # Build documentation in the docs/ directory with Sphinx sphinx: builder: html @@ -12,7 +17,6 @@ sphinx: # Set the version of Python and requirements required to build your docs python: - version: "3.7" install: - method: pip path: airbyte-cdk/python diff --git a/.vscode/frontend.code-workspace b/.vscode/frontend.code-workspace new file mode 100644 index 0000000000000..8ec2e7d144b29 --- /dev/null +++ b/.vscode/frontend.code-workspace @@ -0,0 +1,54 @@ +{ + "folders": [ + { + "path": "../airbyte-webapp" + }, + { + "path": "../airbyte-webapp-e2e-tests" + } + ], + "extensions": { + "recommendations": [ + "dbaeumer.vscode-eslint", + "esbenp.prettier-vscode", + "ms-vsliveshare.vsliveshare", + "eamodio.gitlens" + ] + }, + "settings": { + "javascript.preferences.quoteStyle": "double", + "typescript.preferences.quoteStyle": "double", + "javascript.preferences.importModuleSpecifier": "shortest", + "typescript.preferences.importModuleSpecifier": "shortest", + "javascript.updateImportsOnFileMove.enabled": "always", + "typescript.updateImportsOnFileMove.enabled": "always", + "editor.detectIndentation": true, + "eslint.format.enable": true, + "eslint.run": "onType", + "[javascript]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + "source.organizeImports": false + } + }, + "[typescript]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + "source.organizeImports": false + } + }, + "[typescriptreact]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + 
"source.organizeImports": false + } + }, + "[json]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "esbenp.prettier-vscode" + } + } +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000..f033cb881e892 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,36 @@ +{ + "javascript.preferences.quoteStyle": "double", + "typescript.preferences.quoteStyle": "double", + "javascript.preferences.importModuleSpecifier": "shortest", + "typescript.preferences.importModuleSpecifier": "shortest", + "javascript.updateImportsOnFileMove.enabled": "always", + "typescript.updateImportsOnFileMove.enabled": "always", + "editor.detectIndentation": true, + "eslint.format.enable": true, + "eslint.run": "onType", + "[javascript]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + "source.organizeImports": false + } + }, + "[typescript]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + "source.organizeImports": false + } + }, + "[typescriptreact]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "dbaeumer.vscode-eslint", + "editor.codeActionsOnSave": { + "source.organizeImports": false + } + }, + "[json]": { + "editor.formatOnSave": true, + "editor.defaultFormatter": "esbenp.prettier-vscode" + } +} diff --git a/LICENSE_SHORT b/LICENSE_SHORT index ddae2fe7c9b76..310dba089a8ab 100644 --- a/LICENSE_SHORT +++ b/LICENSE_SHORT @@ -1 +1 @@ -Copyright (c) 2021 Airbyte, Inc., all rights reserved. +Copyright (c) 2022 Airbyte, Inc., all rights reserved. diff --git a/README.md b/README.md index 871157a17068a..9d25fedb7ab38 100644 --- a/README.md +++ b/README.md @@ -2,15 +2,9 @@ [![GitHub stars](https://img.shields.io/github/stars/airbytehq/airbyte?style=social&label=Star&maxAge=2592000)](https://GitHub.com/airbytehq/airbyte/stargazers/) [![GitHub Workflow Status](https://img.shields.io/github/workflow/status/airbytehq/airbyte/Airbyte%20CI)](https://github.com/airbytehq/airbyte/actions/workflows/gradle.yml) [![License](https://img.shields.io/static/v1?label=license&message=MIT&color=brightgreen)](https://github.com/airbytehq/airbyte/tree/a9b1c6c0420550ad5069aca66c295223e0d05e27/LICENSE/README.md) [![License](https://img.shields.io/static/v1?label=license&message=ELv2&color=brightgreen)](https://github.com/airbytehq/airbyte/tree/a9b1c6c0420550ad5069aca66c295223e0d05e27/LICENSE/README.md) -![](docs/.gitbook/assets/airbyte_new_logo.svg) - -**Data integration made simple, secure and extensible.** +**Data integration made simple, secure and extensible.** The new open-source standard to sync data from applications, APIs & databases to warehouses, lakes & other destinations. -[![](docs/.gitbook/assets/deploy_locally.svg)](docs/deploying-airbyte/local-deployment.md) [![](docs/.gitbook/assets/deploy_on_aws.svg)](docs/deploying-airbyte/on-aws-ec2.md) [![](docs/.gitbook/assets/deploy_on_gcp.svg)](docs/deploying-airbyte/on-gcp-compute-engine.md) [![](docs/.gitbook/assets/deploy_on_k8s.svg)](docs/deploying-airbyte/on-kubernetes.md) - -![](docs/.gitbook/assets/airbyte-ui-for-your-integration-pipelines.png) - Airbyte is on a mission to make data integration pipelines a commodity. * **Maintenance-free connectors you can use in minutes**. Just authenticate your sources and warehouse, and get connectors that adapt to schema and API changes for you. 
@@ -50,7 +44,7 @@ Here is a [step-by-step guide](https://github.com/airbytehq/airbyte/tree/e378d40 We love contributions to Airbyte, big or small. -See our [Contributing guide](docs/contributing-to-airbyte/) on how to get started. Not sure where to start? We’ve listed some [good first issues](https://github.com/airbytehq/airbyte/labels/good%20first%20issue) to start with. If you have any questions, please open a draft PR or visit our [slack channel](https://github.com/airbytehq/airbyte/tree/a9b1c6c0420550ad5069aca66c295223e0d05e27/slack.airbyte.io) where the core team can help answer your questions. +See our [Contributing guide](docs/contributing-to-airbyte/README.md) on how to get started. Not sure where to start? We’ve listed some [good first issues](https://github.com/airbytehq/airbyte/labels/good%20first%20issue) to start with. If you have any questions, please open a draft PR or visit our [slack channel](https://github.com/airbytehq/airbyte/tree/a9b1c6c0420550ad5069aca66c295223e0d05e27/slack.airbyte.io) where the core team can help answer your questions. **Note that you are able to create connectors using the language you want, as Airbyte connections run as Docker containers.** @@ -61,13 +55,14 @@ See our [Contributing guide](docs/contributing-to-airbyte/) on how to get starte For general help using Airbyte, please refer to the official Airbyte documentation. For additional help, you can use one of these channels to ask a question: * [Slack](https://slack.airbyte.io) \(For live discussion with the Community and Airbyte team\) +* [Forum](https://discuss.airbyte.io/) \(For deeper converstaions about features, connectors, or problems\) * [GitHub](https://github.com/airbytehq/airbyte) \(Bug reports, Contributions\) * [Twitter](https://twitter.com/airbytehq) \(Get the news fast\) * [Weekly office hours](https://airbyte.io/weekly-office-hours/) \(Live informal 30-minute video call sessions with the Airbyte team\) ## Roadmap -Check out our [roadmap](docs/project-overview/roadmap.md) to get informed on what we are currently working on, and what we have in mind for the next weeks, months and years. +Check out our [roadmap](https://app.harvestr.io/roadmap/view/pQU6gdCyc/launch-week-roadmap) to get informed on what we are currently working on, and what we have in mind for the next weeks, months and years. ## License diff --git a/airbyte-analytics/build.gradle b/airbyte-analytics/build.gradle index a405e5756b684..bfc2ed4f3cc81 100644 --- a/airbyte-analytics/build.gradle +++ b/airbyte-analytics/build.gradle @@ -2,7 +2,9 @@ dependencies { implementation 'com.segment.analytics.java:analytics:2.1.1' - implementation project(':airbyte-config:models') - implementation project(':airbyte-config:persistence') + implementation project(':airbyte-config:config-models') + implementation project(':airbyte-config:config-persistence') implementation project(':airbyte-json-validation') } + +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-analytics/readme.md b/airbyte-analytics/readme.md new file mode 100644 index 0000000000000..4c66fb35c24d8 --- /dev/null +++ b/airbyte-analytics/readme.md @@ -0,0 +1,3 @@ +# airbyte-analytics + +Java library with shared code for telemetry tracking including Segment. 
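[Editor's note] As a rough illustration of how the shared tracking code described in the new `airbyte-analytics/readme.md` is consumed: the `track(...)` calls below mirror the `SegmentTrackingClientTest` changes further down in this diff, while the singleton bootstrap and the exact metadata map type are assumptions, not something this change introduces.

```java
// Hypothetical caller of the airbyte-analytics tracking client (a sketch, not part of this diff).
// track(workspaceId, action[, metadata]) is taken from the test usage shown below;
// the Map<String, Object> metadata type and the no-arg bootstrap are assumptions.
import com.google.common.collect.ImmutableMap;
import io.airbyte.analytics.TrackingClient;
import io.airbyte.analytics.TrackingClientSingleton;
import java.util.Map;
import java.util.UUID;

public class TrackingExample {

  public static void main(String[] args) {
    final UUID workspaceId = UUID.randomUUID();

    // Per the new LOGGER.warn in TrackingClientSingleton, get() logs a warning and
    // initializes a default client if initialize() was never called.
    final TrackingClient trackingClient = TrackingClientSingleton.get();

    // Emit an event without metadata...
    trackingClient.track(workspaceId, "jump");

    // ...or with free-form metadata, mirroring SegmentTrackingClientTest.
    final Map<String, Object> metadata = ImmutableMap.of("height", "80 meters");
    trackingClient.track(workspaceId, "jump", metadata);
  }
}
```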
diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/Deployment.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/Deployment.java index dd92959917836..ae157d8ebb2ec 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/Deployment.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/Deployment.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/LoggingTrackingClient.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/LoggingTrackingClient.java index 5459c02feeda7..4b206f61e8407 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/LoggingTrackingClient.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/LoggingTrackingClient.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/SegmentTrackingClient.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/SegmentTrackingClient.java index 575b101a471ab..840b5f315e071 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/SegmentTrackingClient.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/SegmentTrackingClient.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClient.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClient.java index e123d806231b7..edc7627a07f1b 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClient.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClient.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClientSingleton.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClientSingleton.java index 27072a8fd4ba0..d64e6c4536ce5 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClientSingleton.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingClientSingleton.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; @@ -14,15 +14,20 @@ import java.io.IOException; import java.util.UUID; import java.util.function.Function; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class TrackingClientSingleton { + private static final Logger LOGGER = LoggerFactory.getLogger(LoggingTrackingClient.class); + private static final Object lock = new Object(); private static TrackingClient trackingClient; public static TrackingClient get() { synchronized (lock) { if (trackingClient == null) { + LOGGER.warn("Attempting to fetch an initialized track client. 
Initializing a default one."); initialize(); } return trackingClient; diff --git a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingIdentity.java b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingIdentity.java index f7acd96296cd8..54703c9a27e89 100644 --- a/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingIdentity.java +++ b/airbyte-analytics/src/main/java/io/airbyte/analytics/TrackingIdentity.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-analytics/src/test/java/io/airbyte/analytics/SegmentTrackingClientTest.java b/airbyte-analytics/src/test/java/io/airbyte/analytics/SegmentTrackingClientTest.java index d6d0bb9eb0e94..2d91aba22b414 100644 --- a/airbyte-analytics/src/test/java/io/airbyte/analytics/SegmentTrackingClientTest.java +++ b/airbyte-analytics/src/test/java/io/airbyte/analytics/SegmentTrackingClientTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; @@ -35,6 +35,8 @@ class SegmentTrackingClientTest { private static final TrackingIdentity IDENTITY = new TrackingIdentity(AIRBYTE_VERSION, UUID.randomUUID(), EMAIL, false, false, true); private static final UUID WORKSPACE_ID = UUID.randomUUID(); private static final Function MOCK_TRACKING_IDENTITY = (workspaceId) -> IDENTITY; + private static final String AIRBYTE_VERSION_KEY = "airbyte_version"; + private static final String JUMP = "jump"; private Analytics analytics; private SegmentTrackingClient segmentTrackingClient; @@ -61,7 +63,7 @@ void testIdentify() { final IdentifyMessage actual = mockBuilder.getValue().build(); final Map expectedTraits = ImmutableMap.builder() .put("anonymized", IDENTITY.isAnonymousDataCollection()) - .put("airbyte_version", AIRBYTE_VERSION.serialize()) + .put(AIRBYTE_VERSION_KEY, AIRBYTE_VERSION.serialize()) .put("deployment_env", DEPLOYMENT.getDeploymentEnv()) .put("deployment_mode", DEPLOYMENT.getDeploymentMode()) .put("deployment_id", DEPLOYMENT.getDeploymentId()) @@ -87,7 +89,7 @@ void testIdentifyWithRole() { final IdentifyMessage actual = mockBuilder.getValue().build(); final Map expectedTraits = ImmutableMap.builder() .put("airbyte_role", "role") - .put("airbyte_version", AIRBYTE_VERSION.serialize()) + .put(AIRBYTE_VERSION_KEY, AIRBYTE_VERSION.serialize()) .put("anonymized", IDENTITY.isAnonymousDataCollection()) .put("deployment_env", DEPLOYMENT.getDeploymentEnv()) .put("deployment_mode", DEPLOYMENT.getDeploymentMode()) @@ -104,13 +106,13 @@ void testIdentifyWithRole() { void testTrack() { final ArgumentCaptor mockBuilder = ArgumentCaptor.forClass(TrackMessage.Builder.class); final ImmutableMap metadata = - ImmutableMap.of("airbyte_version", AIRBYTE_VERSION.serialize(), "user_id", IDENTITY.getCustomerId()); + ImmutableMap.of(AIRBYTE_VERSION_KEY, AIRBYTE_VERSION.serialize(), "user_id", IDENTITY.getCustomerId()); - segmentTrackingClient.track(WORKSPACE_ID, "jump"); + segmentTrackingClient.track(WORKSPACE_ID, JUMP); verify(analytics).enqueue(mockBuilder.capture()); final TrackMessage actual = mockBuilder.getValue().build(); - assertEquals("jump", actual.event()); + assertEquals(JUMP, actual.event()); assertEquals(IDENTITY.getCustomerId().toString(), actual.userId()); assertEquals(metadata, filterTrackedAtProperty(Objects.requireNonNull(actual.properties()))); } @@ -119,16 +121,16 @@ void 
testTrack() { void testTrackWithMetadata() { final ArgumentCaptor mockBuilder = ArgumentCaptor.forClass(TrackMessage.Builder.class); final ImmutableMap metadata = ImmutableMap.of( - "airbyte_version", AIRBYTE_VERSION.serialize(), + AIRBYTE_VERSION_KEY, AIRBYTE_VERSION.serialize(), "email", EMAIL, "height", "80 meters", "user_id", IDENTITY.getCustomerId()); - segmentTrackingClient.track(WORKSPACE_ID, "jump", metadata); + segmentTrackingClient.track(WORKSPACE_ID, JUMP, metadata); verify(analytics).enqueue(mockBuilder.capture()); final TrackMessage actual = mockBuilder.getValue().build(); - assertEquals("jump", actual.event()); + assertEquals(JUMP, actual.event()); assertEquals(IDENTITY.getCustomerId().toString(), actual.userId()); assertEquals(metadata, filterTrackedAtProperty(Objects.requireNonNull(actual.properties()))); } @@ -137,7 +139,7 @@ private static ImmutableMap filterTrackedAtProperty(final Map builder = ImmutableMap.builder(); properties.forEach((key, value) -> { - if (!key.equals("tracked_at")) { + if (!"tracked_at".equals(key)) { builder.put(key, value); } }); diff --git a/airbyte-analytics/src/test/java/io/airbyte/analytics/TrackingClientSingletonTest.java b/airbyte-analytics/src/test/java/io/airbyte/analytics/TrackingClientSingletonTest.java index 055b6fa52cdd4..fb6908c051e29 100644 --- a/airbyte-analytics/src/test/java/io/airbyte/analytics/TrackingClientSingletonTest.java +++ b/airbyte-analytics/src/test/java/io/airbyte/analytics/TrackingClientSingletonTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.analytics; diff --git a/airbyte-api/build.gradle b/airbyte-api/build.gradle index c083974052be5..f9314d1b0c646 100644 --- a/airbyte-api/build.gradle +++ b/airbyte-api/build.gradle @@ -17,9 +17,9 @@ task generateApiServer(type: GenerateTask) { inputSpec = specFile outputDir = serverOutputDir - apiPackage = "io.airbyte.api" - invokerPackage = "io.airbyte.api.invoker" - modelPackage = "io.airbyte.api.model" + apiPackage = "io.airbyte.api.generated" + invokerPackage = "io.airbyte.api.invoker.generated" + modelPackage = "io.airbyte.api.model.generated" importMappings = [ 'OAuthConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', @@ -28,7 +28,8 @@ task generateApiServer(type: GenerateTask) { 'DestinationDefinitionSpecification': 'com.fasterxml.jackson.databind.JsonNode', 'DestinationConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', 'StreamJsonSchema' : 'com.fasterxml.jackson.databind.JsonNode', - 'ConnectionStateObject' : 'com.fasterxml.jackson.databind.JsonNode', + 'StateBlob' : 'com.fasterxml.jackson.databind.JsonNode', + 'FieldSchema' : 'com.fasterxml.jackson.databind.JsonNode', ] generateApiDocumentation = false @@ -36,7 +37,15 @@ task generateApiServer(type: GenerateTask) { configOptions = [ dateLibrary : "java8", generatePom : "false", - interfaceOnly: "true" + interfaceOnly: "true", + /* + JAX-RS generator does not respect nullable properties defined in the OpenApi Spec. + It means that if a field is not nullable but not set it is still returning a null value for this field in the serialized json. + The below Jackson annotation is made to only keep non null values in serialized json. + We are not yet using nullable=true properties in our OpenApi so this is a valid workaround at the moment to circumvent the default JAX-RS behavior described above. 
+ Feel free to read the conversation on https://github.com/airbytehq/airbyte/pull/13370 for more details. + */ + additionalModelTypeAnnotations: "\n@com.fasterxml.jackson.annotation.JsonInclude(com.fasterxml.jackson.annotation.JsonInclude.Include.NON_NULL)" ] } compileJava.dependsOn tasks.generateApiServer @@ -51,9 +60,9 @@ task generateApiClient(type: GenerateTask) { inputSpec = specFile outputDir = clientOutputDir - apiPackage = "io.airbyte.api.client" - invokerPackage = "io.airbyte.api.client.invoker" - modelPackage = "io.airbyte.api.client.model" + apiPackage = "io.airbyte.api.client.generated" + invokerPackage = "io.airbyte.api.client.invoker.generated" + modelPackage = "io.airbyte.api.client.model.generated" importMappings = [ 'OAuthConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', @@ -62,7 +71,8 @@ task generateApiClient(type: GenerateTask) { 'DestinationDefinitionSpecification': 'com.fasterxml.jackson.databind.JsonNode', 'DestinationConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', 'StreamJsonSchema' : 'com.fasterxml.jackson.databind.JsonNode', - 'ConnectionStateObject' : 'com.fasterxml.jackson.databind.JsonNode', + 'StateBlob' : 'com.fasterxml.jackson.databind.JsonNode', + 'FieldSchema' : 'com.fasterxml.jackson.databind.JsonNode', ] library = "native" @@ -84,9 +94,9 @@ task generateApiDocs(type: GenerateTask) { inputSpec = specFile outputDir = docsOutputDir - apiPackage = "io.airbyte.api.client" - invokerPackage = "io.airbyte.api.client.invoker" - modelPackage = "io.airbyte.api.client.model" + apiPackage = "io.airbyte.api.client.generated" + invokerPackage = "io.airbyte.api.client.invoker.generated" + modelPackage = "io.airbyte.api.client.model.generated" importMappings = [ 'OAuthConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', @@ -95,7 +105,8 @@ task generateApiDocs(type: GenerateTask) { 'DestinationDefinitionSpecification': 'com.fasterxml.jackson.databind.JsonNode', 'DestinationConfiguration' : 'com.fasterxml.jackson.databind.JsonNode', 'StreamJsonSchema' : 'com.fasterxml.jackson.databind.JsonNode', - 'ConnectionStateObject' : 'com.fasterxml.jackson.databind.JsonNode', + 'StateBlob' : 'com.fasterxml.jackson.databind.JsonNode', + 'FieldSchema' : 'com.fasterxml.jackson.databind.JsonNode', ] generateApiDocumentation = false @@ -143,3 +154,4 @@ sourceSets { } } +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-api/readme.md b/airbyte-api/readme.md new file mode 100644 index 0000000000000..33ffeeb918dd7 --- /dev/null +++ b/airbyte-api/readme.md @@ -0,0 +1,11 @@ +# airbyte-api + +Defines the OpenApi configuration for the Airbyte Configuration API. It also is responsible for generating the following from the API spec: +* Java API client +* Java API server - this generated code is used in `airbyte-server` to allow us to implement the Configuration API in a type safe way. See `ConfigurationApi.java` in `airbyte-server` +* API docs + +## Key Files +* src/openapi/config.yaml - Defines the config API interface using OpenApi3 +* AirbyteApiClient.java - wraps all api clients so that they can be dependency injected together +* PatchedLogsApi.java - fixes generated code for log api. 
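[Editor's note] A minimal sketch of how the consolidated client described in the new `airbyte-api/readme.md` might be wired up. The fluent `ApiClient` setters come from the openapi-generator "native" library; the `getHealthApi()` accessor, the `getHealthCheck()` call, and the `localhost:8001` + `/api` base path are assumptions based on the generated-client conventions, not something this diff adds.

```java
// Hypothetical wiring of the consolidated Airbyte API client (a sketch under the
// assumptions stated above, not a definitive usage).
import io.airbyte.api.client.AirbyteApiClient;
import io.airbyte.api.client.invoker.generated.ApiClient;

public class ApiClientExample {

  public static void main(String[] args) throws Exception {
    // Point the generated invoker at a (assumed) locally running airbyte-server.
    final ApiClient apiClient = new ApiClient()
        .setScheme("http")
        .setHost("localhost")
        .setPort(8001)
        .setBasePath("/api");

    // The wrapper consolidates all generated per-resource clients behind one object
    // so it can be dependency-injected as a unit.
    final AirbyteApiClient airbyteApiClient = new AirbyteApiClient(apiClient);

    // e.g. hit the health endpoint through the generated HealthApi (accessor name assumed).
    System.out.println(airbyteApiClient.getHealthApi().getHealthCheck());
  }
}
```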
diff --git a/airbyte-api/src/main/java/io/airbyte/api/client/AirbyteApiClient.java b/airbyte-api/src/main/java/io/airbyte/api/client/AirbyteApiClient.java index 341cc33779aa9..76945a0f0d5bf 100644 --- a/airbyte-api/src/main/java/io/airbyte/api/client/AirbyteApiClient.java +++ b/airbyte-api/src/main/java/io/airbyte/api/client/AirbyteApiClient.java @@ -1,10 +1,22 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.api.client; -import io.airbyte.api.client.invoker.ApiClient; +import io.airbyte.api.client.generated.ConnectionApi; +import io.airbyte.api.client.generated.DbMigrationApi; +import io.airbyte.api.client.generated.DestinationApi; +import io.airbyte.api.client.generated.DestinationDefinitionApi; +import io.airbyte.api.client.generated.DestinationDefinitionSpecificationApi; +import io.airbyte.api.client.generated.HealthApi; +import io.airbyte.api.client.generated.JobsApi; +import io.airbyte.api.client.generated.OperationApi; +import io.airbyte.api.client.generated.SourceApi; +import io.airbyte.api.client.generated.SourceDefinitionApi; +import io.airbyte.api.client.generated.SourceDefinitionSpecificationApi; +import io.airbyte.api.client.generated.WorkspaceApi; +import io.airbyte.api.client.invoker.generated.ApiClient; /** * This class is meant to consolidate all our API endpoints into a fluent-ish client. Currently, all diff --git a/airbyte-api/src/main/java/io/airbyte/api/client/PatchedLogsApi.java b/airbyte-api/src/main/java/io/airbyte/api/client/PatchedLogsApi.java index baa809452139a..221107722f80b 100644 --- a/airbyte-api/src/main/java/io/airbyte/api/client/PatchedLogsApi.java +++ b/airbyte-api/src/main/java/io/airbyte/api/client/PatchedLogsApi.java @@ -1,14 +1,14 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.api.client; import com.fasterxml.jackson.databind.ObjectMapper; -import io.airbyte.api.client.invoker.ApiClient; -import io.airbyte.api.client.invoker.ApiException; -import io.airbyte.api.client.invoker.ApiResponse; -import io.airbyte.api.client.model.LogsRequestBody; +import io.airbyte.api.client.invoker.generated.ApiClient; +import io.airbyte.api.client.invoker.generated.ApiException; +import io.airbyte.api.client.invoker.generated.ApiResponse; +import io.airbyte.api.client.model.generated.LogsRequestBody; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -21,9 +21,9 @@ import org.apache.commons.io.FileUtils; /** - * This class is a copy of {@link LogsApi} except it allows Accept: text/plain. Without this - * modification, {@link LogsApi} returns a 406 because the generated code requests the wrong - * response type. + * This class is a copy of {@link io.airbyte.api.client.generated.LogsApi} except it allows Accept: + * text/plain. Without this modification, {@link io.airbyte.api.client.generated.LogsApi} returns a + * 406 because the generated code requests the wrong response type. 
*/ public class PatchedLogsApi { diff --git a/airbyte-api/src/main/openapi/config.yaml b/airbyte-api/src/main/openapi/config.yaml index 2db163fc947a5..da65057ffdba2 100644 --- a/airbyte-api/src/main/openapi/config.yaml +++ b/airbyte-api/src/main/openapi/config.yaml @@ -185,6 +185,29 @@ paths: $ref: "#/components/responses/NotFoundResponse" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/workspaces/update_name: + post: + tags: + - workspace + summary: Update workspace name + operationId: updateWorkspaceName + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceUpdateName" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" /v1/workspaces/tag_feedback_status_as_done: post: tags: @@ -336,6 +359,170 @@ paths: $ref: "#/components/responses/NotFoundResponse" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/list_private: + post: + tags: + - source_definition + summary: + List all private, non-custom sourceDefinitions, and for each indicate whether the given workspace has a grant for using the definition. Used + by admins to view and modify a given workspace's grants. + operationId: listPrivateSourceDefinitions + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceIdRequestBody" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/PrivateSourceDefinitionReadList" + /v1/source_definitions/list_for_workspace: + post: + tags: + - source_definition + summary: List all the sourceDefinitions the given workspace is configured to use + operationId: listSourceDefinitionsForWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceIdRequestBody" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionReadList" + /v1/source_definitions/create_custom: + post: + tags: + - source_definition + summary: Creates a custom sourceDefinition for the given workspace + operationId: createCustomSourceDefinition + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/CustomSourceDefinitionCreate" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionRead" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/get_for_workspace: + post: + tags: + - source_definition + summary: Get a sourceDefinition that is configured for the given workspace + operationId: getSourceDefinitionForWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionIdWithWorkspaceId" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/update_custom: + post: + tags: + - source_definition + summary: Update a custom sourceDefinition for the given workspace + operationId: updateCustomSourceDefinition + requestBody: + content: + 
application/json: + schema: + $ref: "#/components/schemas/CustomSourceDefinitionUpdate" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/delete_custom: + post: + tags: + - source_definition + summary: Delete a custom source definition for the given workspace + operationId: deleteCustomSourceDefinition + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionIdWithWorkspaceId" + required: true + responses: + "204": + description: The resource was deleted successfully. + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/grant_definition: + post: + tags: + - source_definition + summary: grant a private, non-custom sourceDefinition to a given workspace + operationId: grantSourceDefinitionToWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionIdWithWorkspaceId" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/PrivateSourceDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/source_definitions/revoke_definition: + post: + tags: + - source_definition + summary: revoke a grant to a private, non-custom sourceDefinition from a given workspace + operationId: revokeSourceDefinitionFromWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SourceDefinitionIdWithWorkspaceId" + required: true + responses: + "204": + description: The resource was deleted successfully. 
+ "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" /v1/source_definition_specifications/get: post: tags: @@ -346,7 +533,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SourceDefinitionIdRequestBody" + $ref: "#/components/schemas/SourceDefinitionIdWithWorkspaceId" required: true responses: "200": @@ -471,6 +658,29 @@ paths: $ref: "#/components/schemas/SourceReadList" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/sources/clone: + post: + tags: + - source + summary: Clone source + operationId: cloneSource + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/SourceCloneRequestBody" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/SourceRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" /v1/sources/delete: post: tags: @@ -546,7 +756,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/SourceIdRequestBody" + $ref: "#/components/schemas/SourceDiscoverSchemaRequestBody" required: true responses: "200": @@ -671,6 +881,170 @@ paths: $ref: "#/components/responses/NotFoundResponse" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/list_private: + post: + tags: + - destination_definition + summary: + List all private, non-custom destinationDefinitions, and for each indicate whether the given workspace has a grant for using the + definition. Used by admins to view and modify a given workspace's grants. + operationId: listPrivateDestinationDefinitions + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceIdRequestBody" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/PrivateDestinationDefinitionReadList" + /v1/destination_definitions/list_for_workspace: + post: + tags: + - destination_definition + summary: List all the destinationDefinitions the given workspace is configured to use + operationId: listDestinationDefinitionsForWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WorkspaceIdRequestBody" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionReadList" + /v1/destination_definitions/create_custom: + post: + tags: + - destination_definition + summary: Creates a custom destinationDefinition for the given workspace + operationId: createCustomDestinationDefinition + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/CustomDestinationDefinitionCreate" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionRead" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/get_for_workspace: + post: + tags: + - destination_definition + summary: Get a destinationDefinition that is configured for the given workspace + operationId: getDestinationDefinitionForWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionIdWithWorkspaceId" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + 
$ref: "#/components/schemas/DestinationDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/update_custom: + post: + tags: + - destination_definition + summary: Update a custom destinationDefinition for the given workspace + operationId: updateCustomDestinationDefinition + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/CustomDestinationDefinitionUpdate" + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/delete_custom: + post: + tags: + - destination_definition + summary: Delete a custom destination definition for the given workspace + operationId: deleteCustomDestinationDefinition + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionIdWithWorkspaceId" + required: true + responses: + "204": + description: The destination was deleted successfully. + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/grant_definition: + post: + tags: + - destination_definition + summary: grant a private, non-custom destinationDefinition to a given workspace + operationId: grantDestinationDefinitionToWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionIdWithWorkspaceId" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/PrivateDestinationDefinitionRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/destination_definitions/revoke_definition: + post: + tags: + - destination_definition + summary: revoke a grant to a private, non-custom destinationDefinition from a given workspace + operationId: revokeDestinationDefinitionFromWorkspace + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationDefinitionIdWithWorkspaceId" + required: true + responses: + "204": + description: The resource was deleted successfully. 
+ "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" /v1/destination_definition_specifications/get: post: tags: @@ -681,7 +1055,7 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/DestinationDefinitionIdRequestBody" + $ref: "#/components/schemas/DestinationDefinitionIdWithWorkspaceId" required: true responses: "200": @@ -869,6 +1243,29 @@ paths: $ref: "#/components/responses/NotFoundResponse" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/destinations/clone: + post: + tags: + - destination + summary: Clone destination + operationId: cloneDestination + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationCloneRequestBody" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/DestinationRead" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" /v1/connections/create: post: tags: @@ -1578,6 +1975,27 @@ paths: $ref: "#/components/schemas/WebBackendConnectionRead" "422": $ref: "#/components/responses/InvalidInputResponse" + /v1/web_backend/connections/updateNew: + post: + operationId: webBackendUpdateConnectionNew + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WebBackendConnectionUpdate" + required: true + responses: + "200": + content: + application/json: + schema: + $ref: "#/components/schemas/WebBackendConnectionRead" + description: Successful operation + "422": + $ref: "#/components/responses/InvalidInputResponse" + summary: Update a connection + tags: + - web_backend /v1/web_backend/connections/search: post: tags: @@ -1588,15 +2006,60 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/WebBackendConnectionSearch" - required: true + $ref: "#/components/schemas/WebBackendConnectionSearch" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/WebBackendConnectionReadList" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/web_backend/state/get_type: + post: + tags: + - connection + summary: Fetch the current state type for a connection. 
+ operationId: getStateType + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/ConnectionIdRequestBody" + required: true + responses: + "200": + description: Successful operation + content: + application/json: + schema: + $ref: "#/components/schemas/ConnectionStateType" + "404": + $ref: "#/components/responses/NotFoundResponse" + "422": + $ref: "#/components/responses/InvalidInputResponse" + /v1/web_backend/workspace/state: + post: + tags: + - web_backend + summary: Returns the current state of a workspace + operationId: webBackendGetWorkspaceState + requestBody: + content: + application/json: + schema: + $ref: "#/components/schemas/WebBackendWorkspaceState" responses: "200": description: Successful operation content: application/json: schema: - $ref: "#/components/schemas/WebBackendConnectionReadList" + $ref: "#/components/schemas/WebBackendWorkspaceStateResult" + "404": + $ref: "#/components/responses/NotFoundResponse" "422": $ref: "#/components/responses/InvalidInputResponse" /v1/jobs/list: @@ -1893,6 +2356,8 @@ components: default: true slackConfiguration: $ref: "#/components/schemas/SlackNotificationConfiguration" + customerioConfiguration: + $ref: "#/components/schemas/CustomerioNotificationConfiguration" SlackNotificationConfiguration: type: object required: @@ -1900,11 +2365,13 @@ components: properties: webhook: type: string + CustomerioNotificationConfiguration: + type: object NotificationType: type: string enum: - slack - # - email + - customerio # - webhook NotificationRead: type: object @@ -1972,6 +2439,16 @@ components: type: boolean feedbackDone: type: boolean + WorkspaceUpdateName: + type: object + required: + - workspaceId + - name + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + name: + type: string WorkspaceUpdate: type: object required: @@ -2007,6 +2484,26 @@ components: properties: workspaceId: $ref: "#/components/schemas/WorkspaceId" + WebBackendWorkspaceState: + type: object + required: + - workspaceId + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + WebBackendWorkspaceStateResult: + type: object + required: + - hasConnections + - hasSources + - hasDestinations + properties: + hasConnections: + type: boolean + hasSources: + type: boolean + hasDestinations: + type: boolean # SLUG SlugRequestBody: type: object @@ -2045,6 +2542,8 @@ components: format: uri icon: type: string + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" SourceDefinitionUpdate: type: object description: Update the SourceDefinition. Currently, the only allowed attribute to update is the default docker image version. @@ -2056,6 +2555,8 @@ components: $ref: "#/components/schemas/SourceDefinitionId" dockerImageTag: type: string + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" SourceDefinitionRead: type: object required: @@ -2077,6 +2578,21 @@ components: format: uri icon: type: string + releaseStage: + $ref: "#/components/schemas/ReleaseStage" + releaseDate: + description: The date when this connector was first released, in yyyy-mm-dd format. 
+ type: string + format: date + sourceType: + type: string + enum: + - api + - file + - database + - custom + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" SourceDefinitionReadList: type: object required: @@ -2086,6 +2602,55 @@ components: type: array items: $ref: "#/components/schemas/SourceDefinitionRead" + CustomSourceDefinitionCreate: + type: object + required: + - workspaceId + - sourceDefinition + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + sourceDefinition: + $ref: "#/components/schemas/SourceDefinitionCreate" + CustomSourceDefinitionUpdate: + type: object + required: + - workspaceId + - sourceDefinition + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + sourceDefinition: + $ref: "#/components/schemas/SourceDefinitionUpdate" + SourceDefinitionIdWithWorkspaceId: + type: object + required: + - sourceDefinitionId + - workspaceId + properties: + sourceDefinitionId: + $ref: "#/components/schemas/SourceDefinitionId" + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + PrivateSourceDefinitionRead: + type: object + required: + - sourceDefinition + - granted + properties: + sourceDefinition: + $ref: "#/components/schemas/SourceDefinitionRead" + granted: + type: boolean + PrivateSourceDefinitionReadList: + type: object + required: + - sourceDefinitions + properties: + sourceDefinitions: + type: array + items: + $ref: "#/components/schemas/PrivateSourceDefinitionRead" # SOURCE SPECIFICATION SourceDefinitionSpecification: description: The specification for what values are required to configure the sourceDefinition. @@ -2178,6 +2743,23 @@ components: properties: sourceId: $ref: "#/components/schemas/SourceId" + SourceCloneRequestBody: + description: The values required to configure the source. The schema for this should have an id of the existing source along with the configuration you want to change in case. + type: object + required: + - sourceCloneId + properties: + sourceCloneId: + $ref: "#/components/schemas/SourceId" + sourceConfiguration: + $ref: "#/components/schemas/SourceCloneConfiguration" + SourceCloneConfiguration: + type: object + properties: + connectionConfiguration: + $ref: "#/components/schemas/SourceConfiguration" + name: + type: string SourceConfiguration: description: The values required to configure the source. The schema for this must match the schema return by source_definition_specifications/get for the source. 
example: { user: "charles" } @@ -2207,6 +2789,15 @@ components: $ref: "#/components/schemas/WorkspaceId" name: type: string + SourceDiscoverSchemaRequestBody: + type: object + required: + - sourceId + properties: + sourceId: + $ref: "#/components/schemas/SourceId" + disable_cache: + type: boolean SourceUpdate: type: object required: @@ -2261,6 +2852,9 @@ components: $ref: "#/components/schemas/AirbyteCatalog" jobInfo: $ref: "#/components/schemas/SynchronousJobRead" + catalogId: + type: string + format: uuid SourceSearch: type: object properties: @@ -2308,6 +2902,8 @@ components: format: uri icon: type: string + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" DestinationDefinitionUpdate: type: object required: @@ -2318,6 +2914,8 @@ components: $ref: "#/components/schemas/DestinationDefinitionId" dockerImageTag: type: string + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" DestinationDefinitionRead: type: object required: @@ -2340,6 +2938,14 @@ components: format: uri icon: type: string + releaseStage: + $ref: "#/components/schemas/ReleaseStage" + releaseDate: + description: The date when this connector was first released, in yyyy-mm-dd format. + type: string + format: date + resourceRequirements: + $ref: "#/components/schemas/ActorDefinitionResourceRequirements" DestinationDefinitionReadList: type: object required: @@ -2349,6 +2955,55 @@ components: type: array items: $ref: "#/components/schemas/DestinationDefinitionRead" + CustomDestinationDefinitionCreate: + type: object + required: + - workspaceId + - destinationDefinition + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + destinationDefinition: + $ref: "#/components/schemas/DestinationDefinitionCreate" + CustomDestinationDefinitionUpdate: + type: object + required: + - workspaceId + - destinationDefinition + properties: + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + destinationDefinition: + $ref: "#/components/schemas/DestinationDefinitionUpdate" + DestinationDefinitionIdWithWorkspaceId: + type: object + required: + - destinationDefinitionId + - workspaceId + properties: + destinationDefinitionId: + $ref: "#/components/schemas/DestinationDefinitionId" + workspaceId: + $ref: "#/components/schemas/WorkspaceId" + PrivateDestinationDefinitionRead: + type: object + required: + - destinationDefinition + - granted + properties: + destinationDefinition: + $ref: "#/components/schemas/DestinationDefinitionRead" + granted: + type: boolean + PrivateDestinationDefinitionReadList: + type: object + required: + - destinationDefinitions + properties: + destinationDefinitions: + type: array + items: + $ref: "#/components/schemas/PrivateDestinationDefinitionRead" # DESTINATION DEFINITION SPECIFICATION DestinationDefinitionSpecification: description: The specification for what values are required to configure the destinationDefinition. @@ -2433,6 +3088,23 @@ components: $ref: "#/components/schemas/DestinationConfiguration" name: type: string + DestinationCloneRequestBody: + description: The values required to configure the destination. The schema for this should have an id of the existing destination along with the configuration you want to change in case. 
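The clone request bodies introduced here take the id of the existing actor plus an optional partial override; the descriptions suggest only the fields you want to change need to be supplied. A sketch of a SourceCloneRequestBody built with Jackson, using invented values:

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

// Sketch of a SourceCloneRequestBody. The id and overridden fields are invented;
// per the schema, sourceConfiguration is optional and only needs the values to change.
public class CloneRequestExample {

  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();

    ObjectNode overrides = mapper.createObjectNode();
    overrides.put("name", "my source (copy)");
    overrides.set("connectionConfiguration",
        mapper.createObjectNode().put("start_date", "2022-06-01T00:00:00Z"));

    ObjectNode request = mapper.createObjectNode();
    request.put("sourceCloneId", "22222222-2222-2222-2222-222222222222");
    request.set("sourceConfiguration", overrides);

    // POST this to /v1/sources/clone to get back a SourceRead for the new copy.
    System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(request));
  }
}
```

The DestinationCloneRequestBody, which continues directly below, mirrors the same shape with destinationCloneId and destinationConfiguration.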
+ type: object + required: + - destinationCloneId + properties: + destinationCloneId: + $ref: "#/components/schemas/DestinationId" + destinationConfiguration: + $ref: "#/components/schemas/DestinationCloneConfiguration" + DestinationCloneConfiguration: + type: object + properties: + connectionConfiguration: + $ref: "#/components/schemas/DestinationConfiguration" + name: + type: string DestinationRead: type: object required: @@ -2479,6 +3151,14 @@ components: type: string destinationName: type: string + # SOURCE / DESTINATION RELEASE STAGE ENUM + ReleaseStage: + type: string + enum: + - alpha + - beta + - generally_available + - custom # CONNECTION ConnectionId: type: string @@ -2542,10 +3222,12 @@ components: $ref: "#/components/schemas/ConnectionStatus" resourceRequirements: $ref: "#/components/schemas/ResourceRequirements" + sourceCatalogId: + type: string + format: uuid WebBackendConnectionCreate: type: object required: - - connection - sourceId - destinationId - status @@ -2583,6 +3265,9 @@ components: type: array items: $ref: "#/components/schemas/OperationCreate" + sourceCatalogId: + type: string + format: uuid ConnectionUpdate: type: object required: @@ -2599,6 +3284,9 @@ components: description: Used when namespaceDefinition is 'customformat'. If blank then behaves like namespaceDefinition = 'destination'. If "${SOURCE_NAMESPACE}" then behaves like namespaceDefinition = 'source'. default: null example: "${SOURCE_NAMESPACE}" + name: + type: string + description: Name that will be set to this connection prefix: type: string description: Prefix that will be prepended to the name of each stream when it is written to the destination. @@ -2614,6 +3302,9 @@ components: $ref: "#/components/schemas/ConnectionStatus" resourceRequirements: $ref: "#/components/schemas/ResourceRequirements" + sourceCatalogId: + type: string + format: uuid WebBackendConnectionUpdate: type: object required: @@ -2621,6 +3312,9 @@ components: - syncCatalog - status properties: + name: + type: string + description: Name that will be set to the connection connectionId: $ref: "#/components/schemas/ConnectionId" namespaceDefinition: @@ -2647,10 +3341,15 @@ components: $ref: "#/components/schemas/ResourceRequirements" withRefreshedCatalog: type: boolean + skipReset: + type: boolean operations: type: array items: $ref: "#/components/schemas/WebBackendOperationCreateOrUpdate" + sourceCatalogId: + type: string + format: uuid ConnectionRead: type: object required: @@ -2686,13 +3385,14 @@ components: syncCatalog: $ref: "#/components/schemas/AirbyteCatalog" schedule: - nullable: true - allOf: - - $ref: "#/components/schemas/ConnectionSchedule" + $ref: "#/components/schemas/ConnectionSchedule" status: $ref: "#/components/schemas/ConnectionStatus" resourceRequirements: $ref: "#/components/schemas/ResourceRequirements" + sourceCatalogId: + type: string + format: uuid ConnectionSearch: type: object properties: @@ -2800,9 +3500,9 @@ components: OperationIdRequestBody: type: object required: - - OperationId + - operationId properties: - OperationId: + operationId: $ref: "#/components/schemas/OperationId" OperationCreate: type: object @@ -2972,7 +3672,6 @@ components: type: string description: Stream's name. jsonSchema: - description: Stream schema using Json Schema specs. $ref: "#/components/schemas/StreamJsonSchema" supportedSyncModes: type: array @@ -2997,6 +3696,7 @@ components: type: string description: Optional Source-defined namespace. Airbyte streams from the same sources should have the same namespace. 
Currently only used by JDBC destinations to determine what schema to write to. StreamJsonSchema: + description: Stream schema using Json Schema specs. type: object AirbyteStreamConfiguration: description: the mutable part of the stream to configure the destination @@ -3061,7 +3761,6 @@ components: configId: type: string pagination: - type: object $ref: "#/components/schemas/Pagination" JobIdRequestBody: type: object @@ -3094,6 +3793,25 @@ components: format: int64 status: $ref: "#/components/schemas/JobStatus" + resetConfig: + $ref: "#/components/schemas/ResetConfig" + ResetConfig: + type: object + description: contains information about how a reset was configured. only populated if the job was a reset. + properties: + streamsToReset: + type: array + items: + $ref: "#/components/schemas/StreamDescriptor" + StreamDescriptor: + type: object + required: + - name + properties: + name: + type: string + namespace: + type: string JobDebugRead: type: object required: @@ -3171,6 +3889,8 @@ components: type: array items: $ref: "#/components/schemas/AttemptStreamStats" + failureSummary: + $ref: "#/components/schemas/AttemptFailureSummary" AttemptStats: type: object properties: @@ -3196,6 +3916,57 @@ components: type: string stats: $ref: "#/components/schemas/AttemptStats" + AttemptFailureSummary: + type: object + required: + - failures + properties: + failures: + type: array + items: + $ref: "#/components/schemas/AttemptFailureReason" + partialSuccess: + description: True if the number of committed records for this attempt was greater than 0. False if 0 records were committed. If not set, the number of committed records is unknown. + type: boolean + AttemptFailureReason: + type: object + required: + - timestamp + properties: + failureOrigin: + $ref: "#/components/schemas/AttemptFailureOrigin" + failureType: + $ref: "#/components/schemas/AttemptFailureType" + externalMessage: + type: string + internalMessage: + type: string + stacktrace: + type: string + retryable: + description: True if it is known that retrying may succeed, e.g. for a transient failure. False if it is known that a retry will not succeed, e.g. for a configuration issue. If not set, retryable status is not well known. + type: boolean + timestamp: + type: integer + format: int64 + AttemptFailureOrigin: + description: Indicates where the error originated. If not set, the origin of error is not well known. + type: string + enum: + - source + - destination + - replication + - persistence + - normalization + - dbt + - airbyte_platform + AttemptFailureType: + description: Categorizes well known errors into types for programmatic handling. If not set, the type of error is not well known. + type: string + enum: + - config_error + - system_error + - manual_cancellation AttemptStatus: type: string enum: @@ -3314,15 +4085,165 @@ components: $ref: "#/components/schemas/SynchronousJobRead" ConnectionState: type: object + description: Contains the state for a connection. The stateType field identifies what type of state it is. Only the field corresponding to that type will be set, the rest will be null. If stateType=not_set, then none of the fields will be set. 
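The reworked ConnectionState above is a discriminated shape: stateType tells the caller which of the sibling fields (state for legacy blobs, streamState, globalState) is populated, and not_set means no state has been persisted yet. A small reading sketch with an invented per-stream payload:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

// Sketch: read the discriminated ConnectionState shape on the client side.
// The payload below is invented to show a "stream"-style state.
public class ConnectionStateExample {

  public static void main(String[] args) throws Exception {
    String payload = """
        {
          "connectionId": "33333333-3333-3333-3333-333333333333",
          "stateType": "stream",
          "streamState": [
            { "streamDescriptor": { "name": "users", "namespace": "public" },
              "streamState": { "cursor": "2022-05-01" } }
          ]
        }
        """;

    JsonNode state = new ObjectMapper().readTree(payload);
    switch (state.get("stateType").asText()) {
      case "legacy" -> System.out.println("legacy blob: " + state.get("state"));
      case "stream" -> System.out.println("per-stream states: " + state.get("streamState"));
      case "global" -> System.out.println("global state: " + state.get("globalState"));
      case "not_set" -> System.out.println("no state persisted yet");
      default -> throw new IllegalStateException("unexpected state type");
    }
  }
}
```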
required: - connectionId + - stateType properties: + stateType: + $ref: "#/components/schemas/ConnectionStateType" connectionId: $ref: "#/components/schemas/ConnectionId" - state: - $ref: "#/components/schemas/ConnectionStateObject" - ConnectionStateObject: + state: # legacy state object + $ref: "#/components/schemas/StateBlob" + streamState: + type: array + items: + $ref: "#/components/schemas/StreamState" + globalState: + $ref: "#/components/schemas/GlobalState" + StateBlob: + type: object + StreamState: + type: object + required: + - streamDescriptor + properties: + streamDescriptor: + $ref: "#/components/schemas/StreamDescriptor" + streamState: + $ref: "#/components/schemas/StateBlob" + GlobalState: + type: object + required: + - streamStates + properties: + shared_state: + $ref: "#/components/schemas/StateBlob" + streamStates: + type: array + items: + $ref: "#/components/schemas/StreamState" + ConnectionStateType: + type: string + enum: + - global + - stream + - legacy + - not_set + CatalogDiff: + type: object + description: Describes the difference between two Airbyte catalogs. + required: + - transforms + properties: + transforms: + description: list of stream transformations. order does not matter. + type: array + items: + $ref: "#/components/schemas/StreamTransform" + StreamTransform: + type: object + required: + - transformType + - streamDescriptor + properties: + transformType: + type: string + enum: + - add_stream + - remove_stream + - update_stream + streamDescriptor: + $ref: "#/components/schemas/StreamDescriptor" + updateStream: + type: array + description: list of field transformations. order does not matter. + items: + $ref: "#/components/schemas/FieldTransform" + FieldTransform: + type: object + description: "Describes the difference between two Streams." + required: + - transformType + - fieldName + properties: + transformType: + type: string + enum: + - add_field + - remove_field + - update_field_schema + fieldName: + $ref: "#/components/schemas/FieldName" + addField: + $ref: "#/components/schemas/FieldAdd" + removeField: + $ref: "#/components/schemas/FieldRemove" + updateFieldSchema: + $ref: "#/components/schemas/FieldSchemaUpdate" + FieldAdd: + type: object + properties: + schema: + $ref: "#/components/schemas/FieldSchema" + FieldRemove: + type: object + properties: + schema: + $ref: "#/components/schemas/FieldSchema" + FieldSchemaUpdate: + type: object + required: + - oldSchema + - newSchema + properties: + oldSchema: + $ref: "#/components/schemas/FieldSchema" + newSchema: + $ref: "#/components/schemas/FieldSchema" + FieldName: + description: A field name is a list of strings that form the path to the field. + type: array + items: + type: string + FieldSchema: + description: JSONSchema representation of the field + type: object + ActorDefinitionResourceRequirements: + description: actor definition specific resource requirements. if default is set, these are the requirements that should be set for ALL jobs run for this actor definition. it is overriden by the job type specific configurations. if not set, the platform will use defaults. these values will be overriden by configuration at the connection level. + type: object + additionalProperties: false + properties: + default: + "$ref": "#/components/schemas/ResourceRequirements" + jobSpecific: + type: array + items: + "$ref": "#/components/schemas/JobTypeResourceLimit" + JobTypeResourceLimit: + description: sets resource requirements for a specific job type for an actor definition. 
these values override the default, if both are set. type: object + additionalProperties: false + required: + - jobType + - resourceRequirements + properties: + jobType: + "$ref": "#/components/schemas/JobType" + resourceRequirements: + "$ref": "#/components/schemas/ResourceRequirements" + JobType: + description: enum that describes the different types of jobs that the platform runs. + type: string + enum: + - get_spec + - check_connection + - discover_schema + - sync + - reset_connection + - connection_updater + - replicate ResourceRequirements: description: optional resource requirements to run workers (blank for unbounded allocations) type: object @@ -3399,10 +4320,8 @@ components: $ref: "#/components/schemas/DbMigrationRead" # OAuth OAuthConfiguration: - description: OAuth specific blob. + description: The values required to configure OAuth flows. The schema for this must match the `OAuthConfigSpecification.oauthUserInputFromConnectorConfigSpecification` schema. OAuthInputConfiguration: - description: The values required to configure OAuth flows. - The schema for this must match the `OAuthConfigSpecification.oauthUserInputFromConnectorConfigSpecification` schema. $ref: "#/components/schemas/OAuthConfiguration" AdvancedAuth: type: object @@ -3669,6 +4588,11 @@ components: type: boolean resourceRequirements: $ref: "#/components/schemas/ResourceRequirements" + catalogId: + type: string + format: uuid + catalogDiff: + $ref: "#/components/schemas/CatalogDiff" WebBackendConnectionReadList: type: object required: diff --git a/airbyte-bootloader/Dockerfile b/airbyte-bootloader/Dockerfile index ee6e05f44f4f1..1b80a6c16ac0b 100644 --- a/airbyte-bootloader/Dockerfile +++ b/airbyte-bootloader/Dockerfile @@ -1,10 +1,15 @@ -ARG JDK_VERSION=17.0.1 -FROM openjdk:${JDK_VERSION}-slim +ARG JDK_VERSION=19-slim-bullseye +ARG JDK_IMAGE=openjdk:${JDK_VERSION} +FROM ${JDK_IMAGE} + +ARG VERSION=0.40.0-alpha ENV APPLICATION airbyte-bootloader +ENV VERSION ${VERSION} WORKDIR /app -ADD bin/${APPLICATION}-0.35.12-alpha.tar /app +ADD bin/${APPLICATION}-${VERSION}.tar /app + -ENTRYPOINT ["/bin/bash", "-c", "${APPLICATION}-0.35.12-alpha/bin/${APPLICATION}"] +ENTRYPOINT ["/bin/bash", "-c", "${APPLICATION}-${VERSION}/bin/${APPLICATION}"] diff --git a/airbyte-bootloader/build.gradle b/airbyte-bootloader/build.gradle index 13b1a08c6f615..211465e583eff 100644 --- a/airbyte-bootloader/build.gradle +++ b/airbyte-bootloader/build.gradle @@ -1,71 +1,27 @@ plugins { id 'application' - id 'maven-publish' - id 'com.github.johnrengelman.shadow' version '7.1.0' } dependencies { implementation project(':airbyte-config:init') - implementation project(':airbyte-config:models') - implementation project(':airbyte-config:persistence') - implementation project(':airbyte-db:lib') + implementation project(':airbyte-config:config-models') + implementation project(':airbyte-config:config-persistence') + implementation project(':airbyte-db:db-lib') implementation project(":airbyte-json-validation") - implementation project(':airbyte-scheduler:persistence') - implementation project(':airbyte-scheduler:models') + implementation project(':airbyte-protocol:protocol-models') + implementation project(':airbyte-scheduler:scheduler-persistence') - implementation 'io.temporal:temporal-sdk:1.6.0' - implementation "org.flywaydb:flyway-core:7.14.0" + implementation 'io.temporal:temporal-sdk:1.8.1' + implementation libs.flyway.core - testImplementation "org.testcontainers:postgresql:1.15.3" + testImplementation libs.platform.testcontainers.postgresql 
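Back in the API change, ActorDefinitionResourceRequirements and JobTypeResourceLimit encode a precedence chain: a job-type-specific entry overrides the definition-wide default, and connection-level requirements override both. A sketch of that resolution with stand-in record types (the generated API models differ in naming):

```java
import java.util.List;
import java.util.Objects;
import java.util.Optional;

// Sketch of the precedence described for ActorDefinitionResourceRequirements:
// a job-type-specific limit wins over the definition-wide default. The record
// types here are stand-ins for the generated API models.
public class ResourceRequirementsExample {

  record ResourceRequirements(String cpuRequest, String cpuLimit, String memoryRequest, String memoryLimit) {}

  record JobTypeResourceLimit(String jobType, ResourceRequirements resourceRequirements) {}

  record ActorDefinitionResourceRequirements(ResourceRequirements defaults, List<JobTypeResourceLimit> jobSpecific) {}

  static Optional<ResourceRequirements> resolve(ActorDefinitionResourceRequirements reqs, String jobType) {
    if (reqs == null) {
      return Optional.empty();
    }
    return reqs.jobSpecific() == null ? Optional.ofNullable(reqs.defaults())
        : reqs.jobSpecific().stream()
            .filter(limit -> Objects.equals(limit.jobType(), jobType))
            .map(JobTypeResourceLimit::resourceRequirements)
            .findFirst()
            .or(() -> Optional.ofNullable(reqs.defaults()));
  }

  public static void main(String[] args) {
    var defaults = new ResourceRequirements("0.5", "1", "512Mi", "1Gi");
    var syncOverride = new JobTypeResourceLimit("sync", new ResourceRequirements("1", "2", "1Gi", "2Gi"));
    var reqs = new ActorDefinitionResourceRequirements(defaults, List.of(syncOverride));

    System.out.println(resolve(reqs, "sync"));             // job-specific override
    System.out.println(resolve(reqs, "check_connection")); // falls back to the default
  }
}
```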
testImplementation 'uk.org.webcompere:system-stubs-jupiter:1.2.0' } application { applicationName = "airbyte-bootloader" mainClass = 'io.airbyte.bootloader.BootloaderApp' - applicationDefaultJvmArgs = ['-XX:MaxRAMPercentage=75.0'] -} - -// Publish this so Airbyte Cloud can consume and extend the classes within this jar. -// This needs to be a shadow jar as none of the other modules are published. -shadowJar { - zip64 true - mergeServiceFiles() - exclude 'META-INF/*.RSA' - exclude 'META-INF/*.SF' - exclude 'META-INF/*.DSA' - // Not stubbing this out adds 'all' to the end of the jar's name. - classifier = '' -} - -publishing { - publications { - shadow(MavenPublication) { publication -> - project.shadow.component(publication) - } - } - - repositories { - publications { - // This block is present so Gradle knows to publish a Maven jar. - maven(MavenPublication) { - from components.java - // Gradle will by default use the subproject path as the group id and the subproject name as the artifact id. - // e.g. the subproject :airbyte-scheduler:models is imported at io.airbyte.airbyte-config:persistence:. - } - } - - maven { - credentials { - name 'cloudrepo' - username System.getenv('CLOUDREPO_USER') - password System.getenv('CLOUDREPO_PASSWORD') - } - url 'https://airbyte.mycloudrepo.io/repositories/airbyte-public-jars' - } - - mavenLocal() - } + applicationDefaultJvmArgs = ['-XX:+ExitOnOutOfMemoryError', '-XX:MaxRAMPercentage=75.0'] } task copyGeneratedTar(type: Copy) { @@ -78,6 +34,15 @@ task copyGeneratedTar(type: Copy) { into 'build/docker/bin' } -Task dockerBuildTask = getDockerBuildTask("bootloader", "$project.projectDir") +Task dockerBuildTask = getDockerBuildTask("bootloader", "$project.projectDir", "$rootProject.ext.version", "$rootProject.ext.image_tag") dockerBuildTask.dependsOn(copyGeneratedTar) assemble.dependsOn(dockerBuildTask) + +// produce reproducible archives +// (see https://docs.gradle.org/current/userguide/working_with_files.html#sec:reproducible_archives) +tasks.withType(AbstractArchiveTask) { + preserveFileTimestamps = false + reproducibleFileOrder = true +} + +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-bootloader/readme.md b/airbyte-bootloader/readme.md new file mode 100644 index 0000000000000..c27261073832f --- /dev/null +++ b/airbyte-bootloader/readme.md @@ -0,0 +1,6 @@ +# airbyte-bootloader + +This application runs at start up for Airbyte. It is responsible for making sure that the environment is upgraded and in a good state. e.g. It makes sure the database has been migrated to the correct version. + +## Entrypoint +* BootloaderApp.java - has the main method for running the bootloader. diff --git a/airbyte-bootloader/src/main/java/io/airbyte/bootloader/BootloaderApp.java b/airbyte-bootloader/src/main/java/io/airbyte/bootloader/BootloaderApp.java index ab7e0f5ec4338..a0211cd3677dc 100644 --- a/airbyte-bootloader/src/main/java/io/airbyte/bootloader/BootloaderApp.java +++ b/airbyte-bootloader/src/main/java/io/airbyte/bootloader/BootloaderApp.java @@ -1,37 +1,43 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.bootloader; -import com.google.common.annotations.VisibleForTesting; import io.airbyte.commons.features.EnvVariableFeatureFlags; import io.airbyte.commons.features.FeatureFlags; +import io.airbyte.commons.lang.CloseableShutdownHook; import io.airbyte.commons.resources.MoreResources; import io.airbyte.commons.version.AirbyteVersion; import io.airbyte.config.Configs; import io.airbyte.config.EnvConfigs; import io.airbyte.config.StandardWorkspace; import io.airbyte.config.init.YamlSeedConfigPersistence; +import io.airbyte.config.persistence.ConfigPersistence; import io.airbyte.config.persistence.ConfigRepository; import io.airbyte.config.persistence.DatabaseConfigPersistence; +import io.airbyte.config.persistence.split_secrets.JsonSecretsProcessor; +import io.airbyte.config.persistence.split_secrets.SecretPersistence; import io.airbyte.db.Database; +import io.airbyte.db.factory.DSLContextFactory; +import io.airbyte.db.factory.DataSourceFactory; +import io.airbyte.db.factory.DatabaseCheckFactory; +import io.airbyte.db.factory.DatabaseDriver; +import io.airbyte.db.factory.FlywayFactory; +import io.airbyte.db.instance.DatabaseConstants; import io.airbyte.db.instance.DatabaseMigrator; -import io.airbyte.db.instance.configs.ConfigsDatabaseInstance; import io.airbyte.db.instance.configs.ConfigsDatabaseMigrator; -import io.airbyte.db.instance.jobs.JobsDatabaseInstance; import io.airbyte.db.instance.jobs.JobsDatabaseMigrator; -import io.airbyte.scheduler.models.Job; -import io.airbyte.scheduler.models.JobStatus; import io.airbyte.scheduler.persistence.DefaultJobPersistence; import io.airbyte.scheduler.persistence.JobPersistence; import io.airbyte.validation.json.JsonValidationException; -import io.temporal.client.WorkflowClient; -import io.temporal.serviceclient.WorkflowServiceStubs; -import io.temporal.serviceclient.WorkflowServiceStubsOptions; import java.io.IOException; import java.util.Optional; import java.util.UUID; +import javax.sql.DataSource; +import org.flywaydb.core.Flyway; +import org.jooq.DSLContext; +import org.jooq.SQLDialect; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,20 +53,26 @@ *

* - setting all required Airbyte metadata information. */ +@SuppressWarnings("PMD.UnusedPrivateField") public class BootloaderApp { private static final Logger LOGGER = LoggerFactory.getLogger(BootloaderApp.class); private static final AirbyteVersion VERSION_BREAK = new AirbyteVersion("0.32.0-alpha"); + private static final String DRIVER_CLASS_NAME = DatabaseDriver.POSTGRESQL.getDriverClassName(); private final Configs configs; - private Runnable postLoadExecution; - private FeatureFlags featureFlags; - - @VisibleForTesting - public BootloaderApp(Configs configs, FeatureFlags featureFlags) { - this.configs = configs; - this.featureFlags = featureFlags; - } + private final Runnable postLoadExecution; + private final FeatureFlags featureFlags; + private final SecretMigrator secretMigrator; + private ConfigPersistence configPersistence; + private ConfigPersistence yamlSeedConfigPersistence; + private Database configDatabase; + private Database jobDatabase; + private JobPersistence jobPersistence; + private final Flyway configsFlyway; + private final Flyway jobsFlyway; + private final DSLContext configsDslContext; + private final DSLContext jobsDslContext; /** * This method is exposed for Airbyte Cloud consumption. This lets us override the seed loading @@ -69,73 +81,172 @@ public BootloaderApp(Configs configs, FeatureFlags featureFlags) { * * @param configs * @param postLoadExecution + * @param featureFlags + * @param secretMigrator + * @param configsDslContext */ - public BootloaderApp(Configs configs, Runnable postLoadExecution, FeatureFlags featureFlags) { + public BootloaderApp(final Configs configs, + final Runnable postLoadExecution, + final FeatureFlags featureFlags, + final SecretMigrator secretMigrator, + final DSLContext configsDslContext, + final DSLContext jobsDslContext, + final Flyway configsFlyway, + final Flyway jobsFlyway) { this.configs = configs; this.postLoadExecution = postLoadExecution; this.featureFlags = featureFlags; + this.secretMigrator = secretMigrator; + this.configsDslContext = configsDslContext; + this.configsFlyway = configsFlyway; + this.jobsDslContext = jobsDslContext; + this.jobsFlyway = jobsFlyway; + + initPersistences(configsDslContext, jobsDslContext); } - public BootloaderApp() { - configs = new EnvConfigs(); + public BootloaderApp(final Configs configs, + final FeatureFlags featureFlags, + final SecretMigrator secretMigrator, + final DSLContext configsDslContext, + final DSLContext jobsDslContext, + final Flyway configsFlyway, + final Flyway jobsFlyway) { + this.configs = configs; + this.featureFlags = featureFlags; + this.secretMigrator = secretMigrator; + this.configsDslContext = configsDslContext; + this.configsFlyway = configsFlyway; + this.jobsDslContext = jobsDslContext; + this.jobsFlyway = jobsFlyway; + + initPersistences(configsDslContext, jobsDslContext); + postLoadExecution = () -> { try { - final Database configDatabase = - new ConfigsDatabaseInstance(configs.getConfigDatabaseUser(), configs.getConfigDatabasePassword(), configs.getConfigDatabaseUrl()) - .getAndInitialize(); - final DatabaseConfigPersistence configPersistence = new DatabaseConfigPersistence(configDatabase); - configPersistence.loadData(YamlSeedConfigPersistence.getDefault()); + configPersistence.loadData(yamlSeedConfigPersistence); + + if (featureFlags.forceSecretMigration() || !jobPersistence.isSecretMigrated()) { + if (this.secretMigrator != null) { + this.secretMigrator.migrateSecrets(); + LOGGER.info("Secrets successfully migrated."); + } + } LOGGER.info("Loaded seed 
data.."); - } catch (IOException e) { - e.printStackTrace(); + } catch (final IOException | JsonValidationException e) { + throw new RuntimeException(e); } }; - featureFlags = new EnvVariableFeatureFlags(); } public void load() throws Exception { - LOGGER.info("Setting up config database and default workspace.."); + LOGGER.info("Initializing databases..."); + DatabaseCheckFactory.createConfigsDatabaseInitializer(configsDslContext, + configs.getConfigsDatabaseInitializationTimeoutMs(), MoreResources.readResource(DatabaseConstants.CONFIGS_SCHEMA_PATH)).initialize(); - try ( - final Database configDatabase = - new ConfigsDatabaseInstance(configs.getConfigDatabaseUser(), configs.getConfigDatabasePassword(), configs.getConfigDatabaseUrl()) - .getAndInitialize(); - final Database jobDatabase = - new JobsDatabaseInstance(configs.getDatabaseUser(), configs.getDatabasePassword(), configs.getDatabaseUrl()).getAndInitialize()) { - LOGGER.info("Created initial jobs and configs database..."); + DatabaseCheckFactory.createJobsDatabaseInitializer(jobsDslContext, + configs.getJobsDatabaseInitializationTimeoutMs(), MoreResources.readResource(DatabaseConstants.JOBS_SCHEMA_PATH)).initialize(); + LOGGER.info("Databases initialized."); - final JobPersistence jobPersistence = new DefaultJobPersistence(jobDatabase); - final AirbyteVersion currAirbyteVersion = configs.getAirbyteVersion(); - assertNonBreakingMigration(jobPersistence, currAirbyteVersion); + LOGGER.info("Setting up config database and default workspace..."); + final JobPersistence jobPersistence = new DefaultJobPersistence(jobDatabase); + final AirbyteVersion currAirbyteVersion = configs.getAirbyteVersion(); + assertNonBreakingMigration(jobPersistence, currAirbyteVersion); - runFlywayMigration(configs, configDatabase, jobDatabase); - LOGGER.info("Ran Flyway migrations..."); + // TODO Will be converted to an injected singleton during DI migration + final DatabaseMigrator configDbMigrator = new ConfigsDatabaseMigrator(configDatabase, configsFlyway); + final DatabaseMigrator jobDbMigrator = new JobsDatabaseMigrator(jobDatabase, jobsFlyway); - final DatabaseConfigPersistence configPersistence = new DatabaseConfigPersistence(configDatabase); - final ConfigRepository configRepository = - new ConfigRepository(configPersistence.withValidation(), null, Optional.empty(), Optional.empty()); + runFlywayMigration(configs, configDbMigrator, jobDbMigrator); + LOGGER.info("Ran Flyway migrations."); - createWorkspaceIfNoneExists(configRepository); - LOGGER.info("Default workspace created.."); + final ConfigRepository configRepository = + new ConfigRepository(configPersistence, configDatabase); - createDeploymentIfNoneExists(jobPersistence); - LOGGER.info("Default deployment created.."); + createWorkspaceIfNoneExists(configRepository); + LOGGER.info("Default workspace created."); - jobPersistence.setVersion(currAirbyteVersion.serialize()); - LOGGER.info("Set version to {}", currAirbyteVersion); - } + createDeploymentIfNoneExists(jobPersistence); + LOGGER.info("Default deployment created."); - if (postLoadExecution != null) { - postLoadExecution.run(); - LOGGER.info("Finished running post load Execution.."); - } + jobPersistence.setVersion(currAirbyteVersion.serialize()); + LOGGER.info("Set version to {}", currAirbyteVersion); + + postLoadExecution.run(); + + LOGGER.info("Finished running post load Execution."); + + LOGGER.info("Finished bootstrapping Airbyte environment."); + } + + private static Database getConfigDatabase(final DSLContext dslContext) throws 
IOException { + return new Database(dslContext); + } + + private static ConfigPersistence getConfigPersistence(final Database configDatabase) throws IOException { + final JsonSecretsProcessor jsonSecretsProcessor = JsonSecretsProcessor.builder() + .maskSecrets(true) + .copySecrets(true) + .build(); + + return DatabaseConfigPersistence.createWithValidation(configDatabase, jsonSecretsProcessor); + } + + private static ConfigPersistence getYamlSeedConfigPersistence() throws IOException { + return new YamlSeedConfigPersistence(YamlSeedConfigPersistence.DEFAULT_SEED_DEFINITION_RESOURCE_CLASS); + } + + private static Database getJobDatabase(final DSLContext dslContext) throws IOException { + return new Database(dslContext); + } - LOGGER.info("Finished bootstrapping Airbyte environment.."); + private static JobPersistence getJobPersistence(final Database jobDatabase) throws IOException { + return new DefaultJobPersistence(jobDatabase); } - public static void main(String[] args) throws Exception { - final var bootloader = new BootloaderApp(); - bootloader.load(); + private void initPersistences(final DSLContext configsDslContext, final DSLContext jobsDslContext) { + try { + configDatabase = getConfigDatabase(configsDslContext); + configPersistence = getConfigPersistence(configDatabase); + yamlSeedConfigPersistence = getYamlSeedConfigPersistence(); + jobDatabase = getJobDatabase(jobsDslContext); + jobPersistence = getJobPersistence(jobDatabase); + } catch (final IOException e) { + LOGGER.error("Unable to initialize persistence.", e); + } + } + + public static void main(final String[] args) throws Exception { + final Configs configs = new EnvConfigs(); + final FeatureFlags featureFlags = new EnvVariableFeatureFlags(); + + // Manual configuration that will be replaced by Dependency Injection in the future + final DataSource configsDataSource = DataSourceFactory.create(configs.getConfigDatabaseUser(), configs.getConfigDatabasePassword(), + DRIVER_CLASS_NAME, configs.getConfigDatabaseUrl()); + final DataSource jobsDataSource = + DataSourceFactory.create(configs.getDatabaseUser(), configs.getDatabasePassword(), DRIVER_CLASS_NAME, configs.getDatabaseUrl()); + + try (final DSLContext configsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES); + final DSLContext jobsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES)) { + + // TODO Will be converted to an injected singleton during DI migration + final Database configDatabase = getConfigDatabase(configsDslContext); + final ConfigPersistence configPersistence = getConfigPersistence(configDatabase); + final Database jobDatabase = getJobDatabase(jobsDslContext); + final JobPersistence jobPersistence = getJobPersistence(jobDatabase); + final SecretMigrator secretMigrator = + new SecretMigrator(configPersistence, jobPersistence, SecretPersistence.getLongLived(configsDslContext, configs)); + final Flyway configsFlyway = FlywayFactory.create(configsDataSource, BootloaderApp.class.getSimpleName(), ConfigsDatabaseMigrator.DB_IDENTIFIER, + ConfigsDatabaseMigrator.MIGRATION_FILE_LOCATION); + final Flyway jobsFlyway = FlywayFactory.create(jobsDataSource, BootloaderApp.class.getSimpleName(), JobsDatabaseMigrator.DB_IDENTIFIER, + JobsDatabaseMigrator.MIGRATION_FILE_LOCATION); + + // Ensure that the database resources are closed on application shutdown + CloseableShutdownHook.registerRuntimeShutdownHook(configsDataSource, jobsDataSource, configsDslContext, jobsDslContext); + + final var bootloader = new 
BootloaderApp(configs, featureFlags, secretMigrator, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + bootloader.load(); + } } private static void createDeploymentIfNoneExists(final JobPersistence jobPersistence) throws IOException { @@ -167,7 +278,8 @@ private static void createWorkspaceIfNoneExists(final ConfigRepository configRep configRepository.writeStandardWorkspace(workspace); } - private static void assertNonBreakingMigration(JobPersistence jobPersistence, AirbyteVersion airbyteVersion) throws IOException { + private static void assertNonBreakingMigration(final JobPersistence jobPersistence, final AirbyteVersion airbyteVersion) + throws IOException { // version in the database when the server main method is called. may be empty if this is the first // time the server is started. LOGGER.info("Checking illegal upgrade.."); @@ -201,10 +313,7 @@ static boolean isLegalUpgrade(final AirbyteVersion airbyteDatabaseVersion, final return !isUpgradingThroughVersionBreak; } - private static void runFlywayMigration(final Configs configs, final Database configDatabase, final Database jobDatabase) { - final DatabaseMigrator configDbMigrator = new ConfigsDatabaseMigrator(configDatabase, BootloaderApp.class.getSimpleName()); - final DatabaseMigrator jobDbMigrator = new JobsDatabaseMigrator(jobDatabase, BootloaderApp.class.getSimpleName()); - + private static void runFlywayMigration(final Configs configs, final DatabaseMigrator configDbMigrator, final DatabaseMigrator jobDbMigrator) { configDbMigrator.createBaseline(); jobDbMigrator.createBaseline(); @@ -218,16 +327,4 @@ private static void runFlywayMigration(final Configs configs, final Database con } } - private static void cleanupZombies(final JobPersistence jobPersistence) throws IOException { - final Configs configs = new EnvConfigs(); - WorkflowClient wfClient = - WorkflowClient.newInstance(WorkflowServiceStubs.newInstance( - WorkflowServiceStubsOptions.newBuilder().setTarget(configs.getTemporalHost()).build())); - for (final Job zombieJob : jobPersistence.listJobsWithStatus(JobStatus.RUNNING)) { - LOGGER.info("Kill zombie job {} for connection {}", zombieJob.getId(), zombieJob.getScope()); - wfClient.newUntypedWorkflowStub("sync_" + zombieJob.getId()) - .terminate("Zombie"); - } - } - } diff --git a/airbyte-bootloader/src/main/java/io/airbyte/bootloader/SecretMigrator.java b/airbyte-bootloader/src/main/java/io/airbyte/bootloader/SecretMigrator.java new file mode 100644 index 0000000000000..a4a427b24dbbc --- /dev/null +++ b/airbyte-bootloader/src/main/java/io/airbyte/bootloader/SecretMigrator.java @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
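Before the new SecretMigrator below, one usage note: the constructor that takes a postLoadExecution is the hook the Javadoc above describes for Airbyte Cloud to override seed loading. A sketch of wiring it up outside of main(), using only the factories introduced in this change; the installer name and log message are placeholders:

```java
import io.airbyte.bootloader.BootloaderApp;
import io.airbyte.commons.features.EnvVariableFeatureFlags;
import io.airbyte.commons.features.FeatureFlags;
import io.airbyte.config.Configs;
import io.airbyte.config.EnvConfigs;
import io.airbyte.db.factory.DSLContextFactory;
import io.airbyte.db.factory.DataSourceFactory;
import io.airbyte.db.factory.DatabaseDriver;
import io.airbyte.db.factory.FlywayFactory;
import io.airbyte.db.instance.configs.ConfigsDatabaseMigrator;
import io.airbyte.db.instance.jobs.JobsDatabaseMigrator;
import javax.sql.DataSource;
import org.flywaydb.core.Flyway;
import org.jooq.DSLContext;
import org.jooq.SQLDialect;

// Sketch: embed the bootloader with a custom postLoadExecution instead of the
// default seed-loading one. The wiring mirrors BootloaderApp.main(); secret
// migration is passed as null because the custom runnable replaces the default
// post-load step that would otherwise use it.
public class EmbeddedBootloaderExample {

  public static void main(String[] args) throws Exception {
    final Configs configs = new EnvConfigs();
    final FeatureFlags featureFlags = new EnvVariableFeatureFlags();
    final String driver = DatabaseDriver.POSTGRESQL.getDriverClassName();

    final DataSource configsDataSource = DataSourceFactory.create(
        configs.getConfigDatabaseUser(), configs.getConfigDatabasePassword(), driver, configs.getConfigDatabaseUrl());
    final DataSource jobsDataSource = DataSourceFactory.create(
        configs.getDatabaseUser(), configs.getDatabasePassword(), driver, configs.getDatabaseUrl());

    try (DSLContext configsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES);
        DSLContext jobsDslContext = DSLContextFactory.create(jobsDataSource, SQLDialect.POSTGRES)) {
      final Flyway configsFlyway = FlywayFactory.create(configsDataSource, "embedded-bootloader",
          ConfigsDatabaseMigrator.DB_IDENTIFIER, ConfigsDatabaseMigrator.MIGRATION_FILE_LOCATION);
      final Flyway jobsFlyway = FlywayFactory.create(jobsDataSource, "embedded-bootloader",
          JobsDatabaseMigrator.DB_IDENTIFIER, JobsDatabaseMigrator.MIGRATION_FILE_LOCATION);

      new BootloaderApp(configs,
          () -> System.out.println("custom seed loading would go here"),
          featureFlags, null, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway)
          .load();
    }
  }
}
```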
+ */ + +package io.airbyte.bootloader; + +import static io.airbyte.config.persistence.split_secrets.SecretsHelpers.COORDINATE_FIELD; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.annotations.VisibleForTesting; +import io.airbyte.commons.json.JsonPaths; +import io.airbyte.commons.json.Jsons; +import io.airbyte.config.ConfigSchema; +import io.airbyte.config.DestinationConnection; +import io.airbyte.config.SourceConnection; +import io.airbyte.config.StandardDestinationDefinition; +import io.airbyte.config.StandardSourceDefinition; +import io.airbyte.config.persistence.ConfigPersistence; +import io.airbyte.config.persistence.split_secrets.SecretCoordinate; +import io.airbyte.config.persistence.split_secrets.SecretPersistence; +import io.airbyte.config.persistence.split_secrets.SecretsHelpers; +import io.airbyte.scheduler.persistence.JobPersistence; +import io.airbyte.validation.json.JsonValidationException; +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import lombok.AllArgsConstructor; +import lombok.Value; +import lombok.extern.slf4j.Slf4j; + +@AllArgsConstructor +@Slf4j +public class SecretMigrator { + + private final ConfigPersistence configPersistence; + private final JobPersistence jobPersistence; + private final Optional secretPersistence; + + @Value + static class ConnectorConfiguration { + + private final UUID workspace; + private final JsonNode configuration; + private final JsonNode spec; + + } + + /** + * Perform a secret migration. It will load all the actor specs extract the secret JsonPath from it. + * Then for all the secret that are stored in a plain text format, it will save the plain text in + * the secret manager and store the coordinate in the config DB. 
+ */ + public void migrateSecrets() throws JsonValidationException, IOException { + if (secretPersistence.isEmpty()) { + log.info("No secret persistence is provided, the migration won't be run "); + + return; + } + final List standardSourceDefinitions = + configPersistence.listConfigs(ConfigSchema.STANDARD_SOURCE_DEFINITION, StandardSourceDefinition.class); + + final Map definitionIdToSourceSpecs = standardSourceDefinitions + .stream().collect(Collectors.toMap(StandardSourceDefinition::getSourceDefinitionId, + def -> def.getSpec().getConnectionSpecification())); + + final List sources = configPersistence.listConfigs(ConfigSchema.SOURCE_CONNECTION, SourceConnection.class); + + migrateSources(sources, definitionIdToSourceSpecs); + + final List standardDestinationDefinitions = + configPersistence.listConfigs(ConfigSchema.STANDARD_DESTINATION_DEFINITION, + StandardDestinationDefinition.class); + + final Map definitionIdToDestinationSpecs = standardDestinationDefinitions.stream() + .collect(Collectors.toMap(StandardDestinationDefinition::getDestinationDefinitionId, + def -> def.getSpec().getConnectionSpecification())); + + final List destinations = configPersistence.listConfigs(ConfigSchema.DESTINATION_CONNECTION, DestinationConnection.class); + + migrateDestinations(destinations, definitionIdToDestinationSpecs); + + jobPersistence.setSecretMigrationDone(); + } + + /** + * This is migrating the secrets for the source actors + */ + @VisibleForTesting + void migrateSources(final List sources, final Map definitionIdToSourceSpecs) + throws JsonValidationException, IOException { + log.info("Migrating Sources"); + final List sourceConnections = sources.stream() + .map(source -> { + final JsonNode migratedConfig = migrateConfiguration(new ConnectorConfiguration( + source.getWorkspaceId(), + source.getConfiguration(), + definitionIdToSourceSpecs.get(source.getSourceDefinitionId())), + () -> UUID.randomUUID()); + source.setConfiguration(migratedConfig); + return source; + }) + .toList(); + + for (final SourceConnection source : sourceConnections) { + configPersistence.writeConfig(ConfigSchema.SOURCE_CONNECTION, source.getSourceId().toString(), source); + } + } + + /** + * This is migrating the secrets for the destination actors + */ + @VisibleForTesting + void migrateDestinations(final List destinations, final Map definitionIdToDestinationSpecs) + throws JsonValidationException, IOException { + log.info("Migration Destinations"); + + final List destinationConnections = destinations.stream().map(destination -> { + final JsonNode migratedConfig = migrateConfiguration(new ConnectorConfiguration( + destination.getWorkspaceId(), + destination.getConfiguration(), + definitionIdToDestinationSpecs.get(destination.getDestinationDefinitionId())), + () -> UUID.randomUUID()); + destination.setConfiguration(migratedConfig); + return destination; + }) + .toList(); + for (final DestinationConnection destination : destinationConnections) { + configPersistence.writeConfig(ConfigSchema.DESTINATION_CONNECTION, destination.getDestinationId().toString(), destination); + } + } + + /** + * This is a generic method to migrate an actor configuration It will extract the secret path form + * the provided spec and then replace them by coordinates in the actor configuration + */ + @VisibleForTesting + JsonNode migrateConfiguration(final ConnectorConfiguration connectorConfiguration, final Supplier uuidProvider) { + if (connectorConfiguration.getSpec() == null) { + throw new IllegalStateException("No connector definition to match 
the connector"); + } + + final AtomicReference connectorConfigurationJson = new AtomicReference<>(connectorConfiguration.getConfiguration()); + final List uniqSecretPaths = getSecretPath(connectorConfiguration.getSpec()) + .stream() + .flatMap(secretPath -> getAllExplodedPath(connectorConfigurationJson.get(), secretPath).stream()) + .toList(); + + final UUID workspaceId = connectorConfiguration.getWorkspace(); + uniqSecretPaths.forEach(secretPath -> { + final Optional secretValue = getValueForPath(connectorConfigurationJson.get(), secretPath); + if (secretValue.isEmpty()) { + throw new IllegalStateException("Missing secret for the path: " + secretPath); + } + + // Only migrate plain text. + if (secretValue.get().isTextual()) { + final JsonNode stringSecretValue = secretValue.get(); + + final SecretCoordinate coordinate = + new SecretCoordinate(SecretsHelpers.getCoordinatorBase("airbyte_workspace_", workspaceId, uuidProvider), 1); + secretPersistence.get().write(coordinate, stringSecretValue.textValue()); + connectorConfigurationJson.set(replaceAtJsonNode(connectorConfigurationJson.get(), secretPath, + Jsons.jsonNode(Map.of(COORDINATE_FIELD, coordinate.getFullCoordinate())))); + } else { + log.error("Not migrating already migrated secrets"); + } + + }); + + return connectorConfigurationJson.get(); + } + + /** + * Wrapper to help to mock static methods + */ + @VisibleForTesting + JsonNode replaceAtJsonNode(final JsonNode connectorConfigurationJson, final String secretPath, final JsonNode replacement) { + return JsonPaths.replaceAtJsonNode(connectorConfigurationJson, secretPath, replacement); + } + + /** + * Wrapper to help to mock static methods + */ + @VisibleForTesting + List getSecretPath(final JsonNode specs) { + return SecretsHelpers.getSortedSecretPaths(specs); + } + + /** + * Wrapper to help to mock static methods + */ + @VisibleForTesting + List getAllExplodedPath(final JsonNode node, final String path) { + return JsonPaths.getPaths(node, path); + } + + /** + * Wrapper to help to mock static methods + */ + @VisibleForTesting + Optional getValueForPath(final JsonNode node, final String path) { + return JsonPaths.getSingleValue(node, path); + } + +} diff --git a/airbyte-bootloader/src/test/java/io/airbyte/bootloader/BootloaderAppTest.java b/airbyte-bootloader/src/test/java/io/airbyte/bootloader/BootloaderAppTest.java index c062764e89d9c..a6f0317e7f38d 100644 --- a/airbyte-bootloader/src/test/java/io/airbyte/bootloader/BootloaderAppTest.java +++ b/airbyte-bootloader/src/test/java/io/airbyte/bootloader/BootloaderAppTest.java @@ -1,27 +1,50 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.bootloader; +import static io.airbyte.config.Configs.SecretPersistenceType.TESTING_CONFIG_DB_TABLE; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.reset; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyNoInteractions; import static org.mockito.Mockito.when; +import com.fasterxml.jackson.databind.ObjectMapper; import io.airbyte.commons.features.FeatureFlags; import io.airbyte.commons.version.AirbyteVersion; import io.airbyte.config.Configs; -import io.airbyte.db.instance.configs.ConfigsDatabaseInstance; +import io.airbyte.config.SourceConnection; +import io.airbyte.config.StandardWorkspace; +import io.airbyte.config.init.YamlSeedConfigPersistence; +import io.airbyte.config.persistence.ConfigPersistence; +import io.airbyte.config.persistence.ConfigRepository; +import io.airbyte.config.persistence.DatabaseConfigPersistence; +import io.airbyte.config.persistence.split_secrets.JsonSecretsProcessor; +import io.airbyte.config.persistence.split_secrets.SecretPersistence; +import io.airbyte.db.factory.DSLContextFactory; +import io.airbyte.db.factory.DataSourceFactory; +import io.airbyte.db.factory.FlywayFactory; import io.airbyte.db.instance.configs.ConfigsDatabaseMigrator; -import io.airbyte.db.instance.jobs.JobsDatabaseInstance; +import io.airbyte.db.instance.configs.ConfigsDatabaseTestProvider; import io.airbyte.db.instance.jobs.JobsDatabaseMigrator; +import io.airbyte.db.instance.jobs.JobsDatabaseTestProvider; import io.airbyte.scheduler.persistence.DefaultJobPersistence; import java.util.Optional; +import java.util.UUID; import java.util.concurrent.atomic.AtomicBoolean; +import javax.sql.DataSource; import lombok.val; +import org.flywaydb.core.Flyway; +import org.jooq.SQLDialect; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.testcontainers.containers.PostgreSQLContainer; @@ -29,21 +52,45 @@ import uk.org.webcompere.systemstubs.jupiter.SystemStub; import uk.org.webcompere.systemstubs.jupiter.SystemStubsExtension; +@SuppressWarnings("PMD.AvoidUsingHardCodedIP") @ExtendWith(SystemStubsExtension.class) -public class BootloaderAppTest { +class BootloaderAppTest { + + private PostgreSQLContainer container; + private DataSource configsDataSource; + private DataSource jobsDataSource; + private static final String DOCKER = "docker"; + private static final String VERSION_0330_ALPHA = "0.33.0-alpha"; + private static final String VERSION_0320_ALPHA = "0.32.0-alpha"; + private static final String VERSION_0321_ALPHA = "0.32.1-alpha"; + private static final String VERSION_0170_ALPHA = "0.17.0-alpha"; + + @BeforeEach + void setup() { + container = new PostgreSQLContainer<>("postgres:13-alpine") + .withDatabaseName("public") + .withUsername(DOCKER) + .withPassword(DOCKER); + container.start(); + + configsDataSource = + DataSourceFactory.create(container.getUsername(), container.getPassword(), container.getDriverClassName(), container.getJdbcUrl()); + jobsDataSource = + DataSourceFactory.create(container.getUsername(), container.getPassword(), container.getDriverClassName(), container.getJdbcUrl()); + } + + 
@AfterEach + void cleanup() throws Exception { + closeDataSource(configsDataSource); + closeDataSource(jobsDataSource); + container.stop(); + } @SystemStub private EnvironmentVariables environmentVariables; @Test void testBootloaderAppBlankDb() throws Exception { - val container = new PostgreSQLContainer<>("postgres:13-alpine") - .withDatabaseName("public") - .withUsername("docker") - .withPassword("docker"); - container.start(); - val version = "0.33.0-alpha"; - val mockedConfigs = mock(Configs.class); when(mockedConfigs.getConfigDatabaseUrl()).thenReturn(container.getJdbcUrl()); when(mockedConfigs.getConfigDatabaseUser()).thenReturn(container.getUsername()); @@ -51,76 +98,195 @@ void testBootloaderAppBlankDb() throws Exception { when(mockedConfigs.getDatabaseUrl()).thenReturn(container.getJdbcUrl()); when(mockedConfigs.getDatabaseUser()).thenReturn(container.getUsername()); when(mockedConfigs.getDatabasePassword()).thenReturn(container.getPassword()); - when(mockedConfigs.getAirbyteVersion()).thenReturn(new AirbyteVersion(version)); + when(mockedConfigs.getAirbyteVersion()).thenReturn(new AirbyteVersion(VERSION_0330_ALPHA)); when(mockedConfigs.runDatabaseMigrationOnStartup()).thenReturn(true); + when(mockedConfigs.getConfigsDatabaseInitializationTimeoutMs()).thenReturn(60000L); + when(mockedConfigs.getJobsDatabaseInitializationTimeoutMs()).thenReturn(60000L); val mockedFeatureFlags = mock(FeatureFlags.class); - when(mockedFeatureFlags.usesNewScheduler()).thenReturn(false); + + val mockedSecretMigrator = mock(SecretMigrator.class); // Although we are able to inject mocked configs into the Bootloader, a particular migration in the // configs database // requires the env var to be set. Flyway prevents injection, so we dynamically set this instead. - environmentVariables.set("DATABASE_USER", "docker"); - environmentVariables.set("DATABASE_PASSWORD", "docker"); + environmentVariables.set("DATABASE_USER", DOCKER); + environmentVariables.set("DATABASE_PASSWORD", DOCKER); environmentVariables.set("DATABASE_URL", container.getJdbcUrl()); - val bootloader = new BootloaderApp(mockedConfigs, mockedFeatureFlags); - bootloader.load(); + try (val configsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES); + val jobsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES)) { + + val configsFlyway = createConfigsFlyway(configsDataSource); + val jobsFlyway = createJobsFlyway(jobsDataSource); + + val configDatabase = new ConfigsDatabaseTestProvider(configsDslContext, configsFlyway).create(false); + val jobDatabase = new JobsDatabaseTestProvider(jobsDslContext, jobsFlyway).create(false); + + val bootloader = + new BootloaderApp(mockedConfigs, mockedFeatureFlags, mockedSecretMigrator, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + bootloader.load(); + + val jobsMigrator = new JobsDatabaseMigrator(jobDatabase, jobsFlyway); + assertEquals("0.35.62.001", jobsMigrator.getLatestMigration().getVersion().getVersion()); + + val configsMigrator = new ConfigsDatabaseMigrator(configDatabase, configsFlyway); + // this line should change with every new migration + // to show that you meant to make a new migration to the prod database + assertEquals("0.39.17.001", configsMigrator.getLatestMigration().getVersion().getVersion()); + + val jobsPersistence = new DefaultJobPersistence(jobDatabase); + assertEquals(VERSION_0330_ALPHA, jobsPersistence.getVersion().get()); + + assertNotEquals(Optional.empty(), jobsPersistence.getDeployment().get()); + } + } + + @Test 
+ void testBootloaderAppRunSecretMigration() throws Exception { + val mockedConfigs = mock(Configs.class); + when(mockedConfigs.getConfigDatabaseUrl()).thenReturn(container.getJdbcUrl()); + when(mockedConfigs.getConfigDatabaseUser()).thenReturn(container.getUsername()); + when(mockedConfigs.getConfigDatabasePassword()).thenReturn(container.getPassword()); + when(mockedConfigs.getDatabaseUrl()).thenReturn(container.getJdbcUrl()); + when(mockedConfigs.getDatabaseUser()).thenReturn(container.getUsername()); + when(mockedConfigs.getDatabasePassword()).thenReturn(container.getPassword()); + when(mockedConfigs.getAirbyteVersion()).thenReturn(new AirbyteVersion(VERSION_0330_ALPHA)); + when(mockedConfigs.runDatabaseMigrationOnStartup()).thenReturn(true); + when(mockedConfigs.getSecretPersistenceType()).thenReturn(TESTING_CONFIG_DB_TABLE); + when(mockedConfigs.getConfigsDatabaseInitializationTimeoutMs()).thenReturn(60000L); + when(mockedConfigs.getJobsDatabaseInitializationTimeoutMs()).thenReturn(60000L); + + val mockedFeatureFlags = mock(FeatureFlags.class); + + final JsonSecretsProcessor jsonSecretsProcessor = JsonSecretsProcessor.builder() + .copySecrets(true) + .maskSecrets(true) + .build(); + + try (val configsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES); + val jobsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES)) { + + val configsFlyway = createConfigsFlyway(configsDataSource); + val jobsFlyway = createJobsFlyway(jobsDataSource); + + val configDatabase = new ConfigsDatabaseTestProvider(configsDslContext, configsFlyway).create(false); + val jobDatabase = new JobsDatabaseTestProvider(jobsDslContext, jobsFlyway).create(false); + + val configPersistence = new DatabaseConfigPersistence(configDatabase, jsonSecretsProcessor); + val jobsPersistence = new DefaultJobPersistence(jobDatabase); + + val spiedSecretMigrator = + spy(new SecretMigrator(configPersistence, jobsPersistence, SecretPersistence.getLongLived(configsDslContext, mockedConfigs))); + + // Although we are able to inject mocked configs into the Bootloader, a particular migration in the + // configs database requires the env var to be set. Flyway prevents injection, so we dynamically set + // this instead. 
+ environmentVariables.set("DATABASE_USER", DOCKER); + environmentVariables.set("DATABASE_PASSWORD", DOCKER); + environmentVariables.set("DATABASE_URL", container.getJdbcUrl()); + + // Bootstrap the database for the test + val initBootloader = new BootloaderApp(mockedConfigs, mockedFeatureFlags, null, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + initBootloader.load(); + + final ConfigPersistence localSchema = new YamlSeedConfigPersistence(YamlSeedConfigPersistence.DEFAULT_SEED_DEFINITION_RESOURCE_CLASS); + final ConfigRepository configRepository = new ConfigRepository(configPersistence, configDatabase); + configRepository.loadDataNoSecrets(localSchema); + + final String sourceSpecs = """ + { + "account_id": "1234567891234567", + "start_date": "2022-04-01T00:00:00Z", + "access_token": "nonhiddensecret", + "include_deleted": false, + "fetch_thumbnail_images": false + } + + """; + + final ObjectMapper mapper = new ObjectMapper(); + + final UUID workspaceId = UUID.randomUUID(); + configRepository.writeStandardWorkspace(new StandardWorkspace() + .withWorkspaceId(workspaceId) + .withName("wName") + .withSlug("wSlug") + .withEmail("email@mail.com") + .withTombstone(false) + .withInitialSetupComplete(false)); + final UUID sourceId = UUID.randomUUID(); + configRepository.writeSourceConnectionNoSecrets(new SourceConnection() + .withSourceDefinitionId(UUID.fromString("e7778cfc-e97c-4458-9ecb-b4f2bba8946c")) // Facebook Marketing + .withSourceId(sourceId) + .withName("test source") + .withWorkspaceId(workspaceId) + .withConfiguration(mapper.readTree(sourceSpecs))); + + when(mockedFeatureFlags.forceSecretMigration()).thenReturn(false); + + // Perform secrets migration + var bootloader = + new BootloaderApp(mockedConfigs, mockedFeatureFlags, spiedSecretMigrator, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + boolean isMigrated = jobsPersistence.isSecretMigrated(); + + assertFalse(isMigrated); - val jobDatabase = new JobsDatabaseInstance( - container.getUsername(), - container.getPassword(), - container.getJdbcUrl()).getInitialized(); - val jobsMigrator = new JobsDatabaseMigrator(jobDatabase, this.getClass().getName()); - assertEquals("0.35.5.001", jobsMigrator.getLatestMigration().getVersion().getVersion()); + bootloader.load(); + verify(spiedSecretMigrator).migrateSecrets(); - val configDatabase = new ConfigsDatabaseInstance( - mockedConfigs.getConfigDatabaseUser(), - mockedConfigs.getConfigDatabasePassword(), - mockedConfigs.getConfigDatabaseUrl()) - .getAndInitialize(); - val configsMigrator = new ConfigsDatabaseMigrator(configDatabase, this.getClass().getName()); - assertEquals("0.35.1.001", configsMigrator.getLatestMigration().getVersion().getVersion()); + final SourceConnection sourceConnection = configRepository.getSourceConnection(sourceId); - val jobsPersistence = new DefaultJobPersistence(jobDatabase); - assertEquals(version, jobsPersistence.getVersion().get()); + assertFalse(sourceConnection.getConfiguration().toString().contains("nonhiddensecret")); + assertTrue(sourceConnection.getConfiguration().toString().contains("_secret")); - assertNotEquals(Optional.empty(), jobsPersistence.getDeployment().get()); + isMigrated = jobsPersistence.isSecretMigrated(); + assertTrue(isMigrated); + + // Verify that the migration does not happen if it has already been performed + reset(spiedSecretMigrator); + // We need to re-create the bootloader because it is closing the persistence after running load + bootloader = + new BootloaderApp(mockedConfigs, mockedFeatureFlags, 
spiedSecretMigrator, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + bootloader.load(); + verifyNoInteractions(spiedSecretMigrator); + + // Verify that the migration occurs if the force migration feature flag is enabled + reset(spiedSecretMigrator); + when(mockedFeatureFlags.forceSecretMigration()).thenReturn(true); + // We need to re-create the bootloader because it is closing the persistence after running load + bootloader = + new BootloaderApp(mockedConfigs, mockedFeatureFlags, spiedSecretMigrator, configsDslContext, jobsDslContext, configsFlyway, jobsFlyway); + bootloader.load(); + verify(spiedSecretMigrator).migrateSecrets(); + } } @Test void testIsLegalUpgradePredicate() { // starting from no previous version is always legal. assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion("0.17.1-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion("0.32.0-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion("0.32.1-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion(VERSION_0320_ALPHA))); + assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion(VERSION_0321_ALPHA))); assertTrue(BootloaderApp.isLegalUpgrade(null, new AirbyteVersion("0.33.1-alpha"))); // starting from a version that is pre-breaking migration cannot go past the breaking migration. - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.17.0-alpha"), new AirbyteVersion("0.17.1-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.17.0-alpha"), new AirbyteVersion("0.18.0-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.17.0-alpha"), new AirbyteVersion("0.32.0-alpha"))); - assertFalse(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.17.0-alpha"), new AirbyteVersion("0.32.1-alpha"))); - assertFalse(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.17.0-alpha"), new AirbyteVersion("0.33.0-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0170_ALPHA), new AirbyteVersion("0.17.1-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0170_ALPHA), new AirbyteVersion("0.18.0-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0170_ALPHA), new AirbyteVersion(VERSION_0320_ALPHA))); + assertFalse(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0170_ALPHA), new AirbyteVersion(VERSION_0321_ALPHA))); + assertFalse(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0170_ALPHA), new AirbyteVersion(VERSION_0330_ALPHA))); // any migration starting at the breaking migration or after it can upgrade to anything. 
- assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.32.0-alpha"), new AirbyteVersion("0.32.1-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.32.0-alpha"), new AirbyteVersion("0.33.0-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.32.1-alpha"), new AirbyteVersion("0.32.1-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.32.1-alpha"), new AirbyteVersion("0.33.0-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.33.0-alpha"), new AirbyteVersion("0.33.1-alpha"))); - assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion("0.33.0-alpha"), new AirbyteVersion("0.34.0-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0320_ALPHA), new AirbyteVersion(VERSION_0321_ALPHA))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0320_ALPHA), new AirbyteVersion(VERSION_0330_ALPHA))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0321_ALPHA), new AirbyteVersion(VERSION_0321_ALPHA))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0321_ALPHA), new AirbyteVersion(VERSION_0330_ALPHA))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0330_ALPHA), new AirbyteVersion("0.33.1-alpha"))); + assertTrue(BootloaderApp.isLegalUpgrade(new AirbyteVersion(VERSION_0330_ALPHA), new AirbyteVersion("0.34.0-alpha"))); } @Test void testPostLoadExecutionExecutes() throws Exception { - var testTriggered = new AtomicBoolean(); - - val container = new PostgreSQLContainer<>("postgres:13-alpine") - .withDatabaseName("public") - .withUsername("docker") - .withPassword("docker"); - container.start(); - val version = "0.33.0-alpha"; - + final var testTriggered = new AtomicBoolean(); val mockedConfigs = mock(Configs.class); when(mockedConfigs.getConfigDatabaseUrl()).thenReturn(container.getJdbcUrl()); when(mockedConfigs.getConfigDatabaseUser()).thenReturn(container.getUsername()); @@ -128,15 +294,44 @@ void testPostLoadExecutionExecutes() throws Exception { when(mockedConfigs.getDatabaseUrl()).thenReturn(container.getJdbcUrl()); when(mockedConfigs.getDatabaseUser()).thenReturn(container.getUsername()); when(mockedConfigs.getDatabasePassword()).thenReturn(container.getPassword()); - when(mockedConfigs.getAirbyteVersion()).thenReturn(new AirbyteVersion(version)); + when(mockedConfigs.getAirbyteVersion()).thenReturn(new AirbyteVersion(VERSION_0330_ALPHA)); when(mockedConfigs.runDatabaseMigrationOnStartup()).thenReturn(true); + when(mockedConfigs.getConfigsDatabaseInitializationTimeoutMs()).thenReturn(60000L); + when(mockedConfigs.getJobsDatabaseInitializationTimeoutMs()).thenReturn(60000L); val mockedFeatureFlags = mock(FeatureFlags.class); - when(mockedFeatureFlags.usesNewScheduler()).thenReturn(false); - new BootloaderApp(mockedConfigs, () -> testTriggered.set(true), mockedFeatureFlags).load(); + val mockedSecretMigrator = mock(SecretMigrator.class); + + try (val configsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES); + val jobsDslContext = DSLContextFactory.create(configsDataSource, SQLDialect.POSTGRES)) { + + val configsFlyway = createConfigsFlyway(configsDataSource); + val jobsFlyway = createJobsFlyway(jobsDataSource); + + new ConfigsDatabaseTestProvider(configsDslContext, configsFlyway).create(false); + new JobsDatabaseTestProvider(jobsDslContext, jobsFlyway).create(false); + + new BootloaderApp(mockedConfigs, () -> testTriggered.set(true), mockedFeatureFlags, 
mockedSecretMigrator, configsDslContext, jobsDslContext, + configsFlyway, jobsFlyway) + .load(); + + assertTrue(testTriggered.get()); + } + } + + private Flyway createConfigsFlyway(final DataSource dataSource) { + return FlywayFactory.create(dataSource, getClass().getName(), ConfigsDatabaseMigrator.DB_IDENTIFIER, + ConfigsDatabaseMigrator.MIGRATION_FILE_LOCATION); + } + + private Flyway createJobsFlyway(final DataSource dataSource) { + return FlywayFactory.create(dataSource, getClass().getName(), JobsDatabaseMigrator.DB_IDENTIFIER, + JobsDatabaseMigrator.MIGRATION_FILE_LOCATION); + } - assertTrue(testTriggered.get()); + private void closeDataSource(final DataSource dataSource) throws Exception { + DataSourceFactory.close(dataSource); } } diff --git a/airbyte-bootloader/src/test/java/io/airbyte/bootloader/SecretMigratorTest.java b/airbyte-bootloader/src/test/java/io/airbyte/bootloader/SecretMigratorTest.java new file mode 100644 index 0000000000000..2557031ccc5fc --- /dev/null +++ b/airbyte-bootloader/src/test/java/io/airbyte/bootloader/SecretMigratorTest.java @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. + */ + +package io.airbyte.bootloader; + +import com.fasterxml.jackson.databind.JsonNode; +import com.google.common.collect.Lists; +import io.airbyte.bootloader.SecretMigrator.ConnectorConfiguration; +import io.airbyte.commons.json.Jsons; +import io.airbyte.config.ConfigSchema; +import io.airbyte.config.DestinationConnection; +import io.airbyte.config.SourceConnection; +import io.airbyte.config.StandardDestinationDefinition; +import io.airbyte.config.StandardSourceDefinition; +import io.airbyte.config.persistence.ConfigPersistence; +import io.airbyte.config.persistence.split_secrets.SecretCoordinate; +import io.airbyte.config.persistence.split_secrets.SecretPersistence; +import io.airbyte.protocol.models.ConnectorSpecification; +import io.airbyte.scheduler.persistence.JobPersistence; +import io.airbyte.validation.json.JsonValidationException; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.UUID; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +class SecretMigratorTest { + + private final UUID workspaceId = UUID.randomUUID(); + + @Mock + private ConfigPersistence configPersistence; + + @Mock + private SecretPersistence secretPersistence; + + @Mock + private JobPersistence jobPersistence; + + private SecretMigrator secretMigrator; + + @BeforeEach + void setup() { + secretMigrator = Mockito.spy(new SecretMigrator(configPersistence, jobPersistence, Optional.of(secretPersistence))); + } + + @Test + void testMigrateSecret() throws JsonValidationException, IOException { + final JsonNode sourceSpec = Jsons.jsonNode("sourceSpec"); + final UUID sourceDefinitionId = UUID.randomUUID(); + final StandardSourceDefinition standardSourceDefinition = new StandardSourceDefinition() + .withSourceDefinitionId(sourceDefinitionId) + .withSpec( + new ConnectorSpecification() + .withConnectionSpecification(sourceSpec)); + final Map standardSourceDefinitions = new HashMap<>(); + standardSourceDefinitions.put(sourceDefinitionId, standardSourceDefinition.getSpec().getConnectionSpecification()); + 
Mockito.when(configPersistence.listConfigs(ConfigSchema.STANDARD_SOURCE_DEFINITION, StandardSourceDefinition.class)) + .thenReturn(Lists.newArrayList(standardSourceDefinition)); + + final JsonNode sourceConfiguration = Jsons.jsonNode("sourceConfiguration"); + final SourceConnection sourceConnection = new SourceConnection() + .withSourceId(UUID.randomUUID()) + .withSourceDefinitionId(sourceDefinitionId) + .withConfiguration(sourceConfiguration) + .withWorkspaceId(workspaceId); + final List sourceConnections = Lists.newArrayList(sourceConnection); + Mockito.when(configPersistence.listConfigs(ConfigSchema.SOURCE_CONNECTION, SourceConnection.class)) + .thenReturn(sourceConnections); + + final JsonNode destinationSpec = Jsons.jsonNode("destinationSpec"); + final UUID destinationDefinitionId = UUID.randomUUID(); + final StandardDestinationDefinition standardDestinationDefinition = new StandardDestinationDefinition() + .withDestinationDefinitionId(destinationDefinitionId) + .withSpec( + new ConnectorSpecification() + .withConnectionSpecification(destinationSpec)); + final Map standardDestinationDefinitions = new HashMap<>(); + standardDestinationDefinitions.put(destinationDefinitionId, standardDestinationDefinition.getSpec().getConnectionSpecification()); + Mockito.when(configPersistence.listConfigs(ConfigSchema.STANDARD_DESTINATION_DEFINITION, StandardDestinationDefinition.class)) + .thenReturn(Lists.newArrayList(standardDestinationDefinition)); + + final JsonNode destinationConfiguration = Jsons.jsonNode("destinationConfiguration"); + final DestinationConnection destinationConnection = new DestinationConnection() + .withDestinationId(UUID.randomUUID()) + .withDestinationDefinitionId(destinationDefinitionId) + .withConfiguration(destinationConfiguration) + .withWorkspaceId(workspaceId); + final List destinationConnections = Lists.newArrayList(destinationConnection); + Mockito.when(configPersistence.listConfigs(ConfigSchema.DESTINATION_CONNECTION, DestinationConnection.class)) + .thenReturn(destinationConnections); + + // Mockito.doNothing().when(secretMigrator).migrateDestinations(Mockito.any(), Mockito.any()); + + final String path = "Mocked static call source"; + Mockito.doReturn(Lists.newArrayList(path)).when(secretMigrator).getSecretPath(sourceSpec); + Mockito.doReturn(Lists.newArrayList(path)).when(secretMigrator).getAllExplodedPath(sourceConfiguration, path); + final String sourceSecret = "sourceSecret"; + Mockito.doReturn(Optional.of(Jsons.jsonNode(sourceSecret))).when(secretMigrator).getValueForPath(sourceConfiguration, path); + Mockito.doReturn(Lists.newArrayList(path)).when(secretMigrator).getSecretPath(destinationSpec); + Mockito.doReturn(Lists.newArrayList(path)).when(secretMigrator).getAllExplodedPath(destinationConfiguration, path); + final String destinationSecret = "destinationSecret"; + Mockito.doReturn(Optional.of(Jsons.jsonNode(destinationSecret))).when(secretMigrator).getValueForPath(destinationConfiguration, path); + + Mockito.doReturn(Jsons.jsonNode("sanitized")).when(secretMigrator).replaceAtJsonNode(Mockito.any(), Mockito.any(), Mockito.any()); + secretMigrator.migrateSecrets(); + + Mockito.verify(secretMigrator).migrateSources(sourceConnections, standardSourceDefinitions); + Mockito.verify(secretPersistence).write(Mockito.any(), Mockito.eq(sourceSecret)); + secretPersistence.write(Mockito.any(), Mockito.any()); + Mockito.verify(secretMigrator).migrateDestinations(destinationConnections, standardDestinationDefinitions); + 
Mockito.verify(secretPersistence).write(Mockito.any(), Mockito.eq(destinationSecret)); + + Mockito.verify(jobPersistence).setSecretMigrationDone(); + } + + @Test + void testSourceMigration() throws JsonValidationException, IOException { + final UUID definitionId1 = UUID.randomUUID(); + final UUID definitionId2 = UUID.randomUUID(); + final UUID sourceId1 = UUID.randomUUID(); + final UUID sourceId2 = UUID.randomUUID(); + final JsonNode sourceConfiguration1 = Jsons.jsonNode("conf1"); + final JsonNode sourceConfiguration2 = Jsons.jsonNode("conf2"); + final JsonNode sourceDefinition1 = Jsons.jsonNode("def1"); + final JsonNode sourceDefinition2 = Jsons.jsonNode("def2"); + final SourceConnection sourceConnection1 = new SourceConnection() + .withSourceId(sourceId1) + .withSourceDefinitionId(definitionId1) + .withConfiguration(sourceConfiguration1); + final SourceConnection sourceConnection2 = new SourceConnection() + .withSourceId(sourceId2) + .withSourceDefinitionId(definitionId2) + .withConfiguration(sourceConfiguration2); + + final List sources = Lists.newArrayList(sourceConnection1, sourceConnection2); + final Map definitionIdToDestinationSpecs = new HashMap<>(); + definitionIdToDestinationSpecs.put(definitionId1, sourceDefinition1); + definitionIdToDestinationSpecs.put(definitionId2, sourceDefinition2); + + Mockito.doReturn(Jsons.emptyObject()).when(secretMigrator).migrateConfiguration( + Mockito.any(), + Mockito.any()); + + secretMigrator.migrateSources(sources, definitionIdToDestinationSpecs); + + Mockito.verify(configPersistence).writeConfig(ConfigSchema.SOURCE_CONNECTION, sourceId1.toString(), sourceConnection1); + Mockito.verify(configPersistence).writeConfig(ConfigSchema.SOURCE_CONNECTION, sourceId2.toString(), sourceConnection2); + } + + @Test + void testDestinationMigration() throws JsonValidationException, IOException { + final UUID definitionId1 = UUID.randomUUID(); + final UUID definitionId2 = UUID.randomUUID(); + final UUID destinationId1 = UUID.randomUUID(); + final UUID destinationId2 = UUID.randomUUID(); + final JsonNode destinationConfiguration1 = Jsons.jsonNode("conf1"); + final JsonNode destinationConfiguration2 = Jsons.jsonNode("conf2"); + final JsonNode destinationDefinition1 = Jsons.jsonNode("def1"); + final JsonNode destinationDefinition2 = Jsons.jsonNode("def2"); + final DestinationConnection destinationConnection1 = new DestinationConnection() + .withDestinationId(destinationId1) + .withDestinationDefinitionId(definitionId1) + .withConfiguration(destinationConfiguration1); + final DestinationConnection destinationConnection2 = new DestinationConnection() + .withDestinationId(destinationId2) + .withDestinationDefinitionId(definitionId2) + .withConfiguration(destinationConfiguration2); + + final List destinations = Lists.newArrayList(destinationConnection1, destinationConnection2); + final Map definitionIdToDestinationSpecs = new HashMap<>(); + definitionIdToDestinationSpecs.put(definitionId1, destinationDefinition1); + definitionIdToDestinationSpecs.put(definitionId2, destinationDefinition2); + + Mockito.doReturn(Jsons.emptyObject()).when(secretMigrator).migrateConfiguration( + Mockito.any(), + Mockito.any()); + + secretMigrator.migrateDestinations(destinations, definitionIdToDestinationSpecs); + + Mockito.verify(configPersistence).writeConfig(ConfigSchema.DESTINATION_CONNECTION, destinationId1.toString(), destinationConnection1); + Mockito.verify(configPersistence).writeConfig(ConfigSchema.DESTINATION_CONNECTION, destinationId2.toString(), destinationConnection2); + } 
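For orientation, the behaviour these tests pin down is the secret "split": the migrator pulls each raw secret value out of a connector configuration, writes it to the secret persistence layer under a coordinate, and leaves only a `_secret` reference behind (which is what the Bootloader test above asserts via `nonhiddensecret` and `_secret`). The sketch below is a language-agnostic illustration in Python, not the Java `SecretMigrator` under test; `InMemorySecretStore`, `split_secret`, and the coordinate format are hypothetical names.

```python
# Illustrative sketch only: these names are hypothetical and do not correspond to the
# Java SecretMigrator exercised by the tests above.
import uuid
from typing import Any, Dict


class InMemorySecretStore:
    """Minimal stand-in for a secret persistence layer keyed by coordinate."""

    def __init__(self) -> None:
        self.values: Dict[str, str] = {}

    def write(self, coordinate: str, value: str) -> None:
        self.values[coordinate] = value


def split_secret(config: Dict[str, Any], secret_key: str, store: InMemorySecretStore) -> Dict[str, Any]:
    """Store the raw secret value and replace it in the config with a coordinate reference."""
    coordinate = f"secret_coordinate_{uuid.uuid4()}_v1"  # made-up coordinate format
    store.write(coordinate, str(config[secret_key]))
    sanitized = dict(config)
    sanitized[secret_key] = {"_secret": coordinate}
    return sanitized


store = InMemorySecretStore()
sanitized = split_secret({"account_id": "1234567891234567", "access_token": "nonhiddensecret"}, "access_token", store)
assert "nonhiddensecret" not in str(sanitized)                                # raw value no longer in the stored config
assert "_secret" in str(sanitized["access_token"])                            # only a reference remains
assert store.values[sanitized["access_token"]["_secret"]] == "nonhiddensecret"  # raw value lives in the store
```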
+ + @Test + void testMigrateConfigurationWithoutSpecs() { + final ConnectorConfiguration connectorConfiguration = new ConnectorConfiguration(null, null, null); + + Assertions.assertThrows(IllegalStateException.class, () -> secretMigrator.migrateConfiguration(connectorConfiguration, null)); + } + + @Test + void testMissingSecret() { + final List secretPathList = Lists.newArrayList("secretPath"); + + Mockito.doReturn(secretPathList).when(secretMigrator).getSecretPath(Mockito.any()); + Mockito.doReturn(secretPathList).when(secretMigrator).getAllExplodedPath(Mockito.any(), Mockito.eq(secretPathList.get(0))); + Mockito.doReturn(Optional.empty()).when(secretMigrator).getValueForPath(Mockito.any(), Mockito.eq(secretPathList.get(0))); + + final ConnectorConfiguration connectorConfiguration = new ConnectorConfiguration(UUID.randomUUID(), Jsons.emptyObject(), Jsons.emptyObject()); + Assertions.assertThrows(IllegalStateException.class, () -> secretMigrator.migrateConfiguration(connectorConfiguration, () -> UUID.randomUUID())); + } + + @Test + void testMigrateConfiguration() { + final List secretPathList = Lists.newArrayList("$.secretPath"); + + Mockito.doReturn(secretPathList).when(secretMigrator).getSecretPath(Mockito.any()); + Mockito.doReturn(secretPathList).when(secretMigrator).getAllExplodedPath(Mockito.any(), Mockito.eq(secretPathList.get(0))); + Mockito.doReturn(Optional.of(Jsons.jsonNode(secretPathList.get(0)))).when(secretMigrator).getValueForPath(Mockito.any(), + Mockito.eq(secretPathList.get(0))); + + final ConnectorConfiguration connectorConfiguration = new ConnectorConfiguration(UUID.randomUUID(), Jsons.emptyObject(), Jsons.emptyObject()); + + secretMigrator.migrateConfiguration(connectorConfiguration, () -> UUID.randomUUID()); + Mockito.verify(secretPersistence).write(Mockito.any(), Mockito.any()); + } + + @Test + void testMigrateConfigurationAlreadyInSecretManager() { + final List secretPathList = Lists.newArrayList("$.secretPath"); + + Mockito.doReturn(secretPathList).when(secretMigrator).getSecretPath(Mockito.any()); + Mockito.doReturn(secretPathList).when(secretMigrator).getAllExplodedPath(Mockito.any(), Mockito.eq(secretPathList.get(0))); + + final SecretCoordinate fakeCoordinate = new SecretCoordinate("fake", 1); + Mockito.doReturn(Optional.of(Jsons.jsonNode(fakeCoordinate))).when(secretMigrator).getValueForPath(Mockito.any(), + Mockito.eq(secretPathList.get(0))); + + final ConnectorConfiguration connectorConfiguration = new ConnectorConfiguration(UUID.randomUUID(), Jsons.emptyObject(), Jsons.emptyObject()); + + secretMigrator.migrateConfiguration(connectorConfiguration, () -> UUID.randomUUID()); + Mockito.verify(secretPersistence, Mockito.times(0)).write(Mockito.any(), Mockito.any()); + } + +} diff --git a/airbyte-cdk/python/CHANGELOG.md b/airbyte-cdk/python/CHANGELOG.md index 4c4f0cf604aae..7f0fa5574a952 100644 --- a/airbyte-cdk/python/CHANGELOG.md +++ b/airbyte-cdk/python/CHANGELOG.md @@ -1,5 +1,86 @@ # Changelog +## 0.1.72 +- Bugfix: Fix bug in DatetimeStreamSlicer's parsing method + +## 0.1.71 +- Refactor declarative package to dataclasses +- Bugfix: Requester header always converted to string +- Bugfix: Reset paginator state between stream slices +- Bugfix: Record selector handles single records + +## 0.1.70 +- Bugfix: DatetimeStreamSlicer cast interpolated result to string before converting to datetime +- Bugfix: Set stream slicer's request options in SimpleRetriever + +## 0.1.69 +- AbstractSource emits a state message when reading incremental even if there were no stream 
slices to process. + +## 0.1.68 +- Replace parse-time string interpolation with run-time interpolation in YAML-based sources + +## 0.1.67 +- Add support for declarative token authenticator. + +## 0.1.66 +- Call init_uncaught_exception_handler from AirbyteEntrypoint.__init__ and Destination.run_cmd +- Add the ability to remove & add records in YAML-based sources + +## 0.1.65 +- Allow for detailed debug messages to be enabled using the --debug command. + +## 0.1.64 +- Add support for configurable oauth request payload and declarative oauth authenticator. + +## 0.1.63 +- Define `namespace` property on the `Stream` class inside `core.py`. + +## 0.1.62 +Bugfix: Correctly obfuscate nested secrets and secrets specified inside oneOf blocks inside the connector's spec. + +## 0.1.61 +- Remove legacy sentry code + +## 0.1.60 +- Add `requests.exceptions.ChunkedEncodingError` to transient errors so it can be retried + +## 0.1.59 +- Add `Stream.get_error_display_message()` to retrieve user-friendly messages from exceptions encountered while reading streams. +- Add default error message retrieval logic for `HTTPStream`s following common API patterns. + +## 0.1.58 +`TypeTransformer.default_convert` catches `TypeError` + +## 0.1.57 +Update protocol models to support per-stream state: [#12829](https://github.com/airbytehq/airbyte/pull/12829). + +## 0.1.56 +- Update protocol models to include `AirbyteTraceMessage` +- Emit an `AirbyteTraceMessage` on uncaught exceptions +- Add `AirbyteTracedException` + +## 0.1.55 +Add support for reading the spec from a YAML file (`spec.yaml`) + +## 0.1.54 +- Add ability to import `IncrementalMixin` from `airbyte_cdk.sources.streams`. +- Bumped minimum supported Python version to 3.9. + +## 0.1.53 +Remove false-positive error logging during the send process. + +## 0.1.52 +Fix BaseBackoffException constructor + +## 0.1.50 +Improve logging for error handling during the send process. + +## 0.1.49 +Add support for streams with an explicit state attribute. + +## 0.1.48 +Fix type annotations. + ## 0.1.47 Fix typing errors. diff --git a/airbyte-cdk/python/README.md b/airbyte-cdk/python/README.md index e848bf96264e7..5c5700141a91a 100644 --- a/airbyte-cdk/python/README.md +++ b/airbyte-cdk/python/README.md @@ -8,7 +8,7 @@ The Airbyte Python CDK is a framework for rapidly developing production-grade Ai The CDK provides an improved developer experience by providing basic implementation structure and abstracting away low-level glue boilerplate. -This document is a general introduction to the CDK. Readers should have basic familiarity with the [Airbyte Specification](https://docs.airbyte.io/architecture/airbyte-specification) before proceeding. +This document is a general introduction to the CDK. Readers should have basic familiarity with the [Airbyte Specification](https://docs.airbyte.io/architecture/airbyte-protocol) before proceeding. ## Getting Started @@ -49,7 +49,7 @@ See the [concepts docs](docs/concepts/) for a tour through what the API offers. ### First time setup -We assume `python` points to python >=3.7. +We assume `python` points to python >=3.9. Set up a virtual env: diff --git a/airbyte-cdk/python/airbyte_cdk/connector.py b/airbyte-cdk/python/airbyte_cdk/connector.py index f17c76ab5754e..bd47f188c9076 100644 --- a/airbyte-cdk/python/airbyte_cdk/connector.py +++ b/airbyte-cdk/python/airbyte_cdk/connector.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
# @@ -8,11 +8,20 @@ import os import pkgutil from abc import ABC, abstractmethod -from typing import Any, Mapping, Optional +from typing import Any, Generic, Mapping, Optional, Protocol, TypeVar +import yaml from airbyte_cdk.models import AirbyteConnectionStatus, ConnectorSpecification +def load_optional_package_file(package: str, filename: str) -> Optional[bytes]: + """Gets a resource from a package, returning None if it does not exist""" + try: + return pkgutil.get_data(package, filename) + except FileNotFoundError: + return None + + class AirbyteSpec(object): @staticmethod def from_file(file_name: str): @@ -24,43 +33,74 @@ def __init__(self, spec_string): self.spec_string = spec_string -class Connector(ABC): +TConfig = TypeVar("TConfig", bound=Mapping[str, Any]) + + +class BaseConnector(ABC, Generic[TConfig]): # configure whether the `check_config_against_spec_or_exit()` needs to be called check_config_against_spec: bool = True - # can be overridden to change an input config - def configure(self, config: Mapping[str, Any], temp_dir: str) -> Mapping[str, Any]: + @abstractmethod + def configure(self, config: Mapping[str, Any], temp_dir: str) -> TConfig: """ Persist config in temporary directory to run the Source job """ - config_path = os.path.join(temp_dir, "config.json") - self.write_config(config, config_path) - return config @staticmethod - def read_config(config_path: str) -> Mapping[str, Any]: + def read_config(config_path: str) -> TConfig: with open(config_path, "r") as file: contents = file.read() return json.loads(contents) @staticmethod - def write_config(config: Mapping[str, Any], config_path: str): + def write_config(config: TConfig, config_path: str): with open(config_path, "w") as fh: fh.write(json.dumps(config)) def spec(self, logger: logging.Logger) -> ConnectorSpecification: """ Returns the spec for this integration. The spec is a JSON-Schema object describing the required configurations (e.g: username and password) - required to run this integration. + required to run this integration. By default, this will be loaded from a "spec.yaml" or a "spec.json" in the package root. """ - raw_spec: Optional[bytes] = pkgutil.get_data(self.__class__.__module__.split(".")[0], "spec.json") - if not raw_spec: - raise ValueError("Unable to find spec.json.") - return ConnectorSpecification.parse_obj(json.loads(raw_spec)) + + package = self.__class__.__module__.split(".")[0] + + yaml_spec = load_optional_package_file(package, "spec.yaml") + json_spec = load_optional_package_file(package, "spec.json") + + if yaml_spec and json_spec: + raise RuntimeError("Found multiple spec files in the package. Only one of spec.yaml or spec.json should be provided.") + + if yaml_spec: + spec_obj = yaml.load(yaml_spec, Loader=yaml.SafeLoader) + elif json_spec: + spec_obj = json.loads(json_spec) + else: + raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.") + + return ConnectorSpecification.parse_obj(spec_obj) @abstractmethod - def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: + def check(self, logger: logging.Logger, config: TConfig) -> AirbyteConnectionStatus: """ Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect to the Stripe API. """ + + +class _WriteConfigProtocol(Protocol): + @staticmethod + def write_config(config: Mapping[str, Any], config_path: str): + ... 
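Worth spelling out the new `spec()` resolution introduced above: a connector package may now ship either a `spec.yaml` or a `spec.json`, shipping both is an error, and shipping neither is an error. The following is a minimal standalone sketch of that precedence logic under those assumptions; it mirrors `load_optional_package_file` and the base `spec()` rather than calling the CDK, and the package name in the usage comment is hypothetical.

```python
# Standalone sketch of the spec.yaml / spec.json precedence introduced above.
import json
import pkgutil
from typing import Any, Dict, Optional

import yaml


def _load_optional_package_file(package: str, filename: str) -> Optional[bytes]:
    """Return a packaged resource, or None if the package does not ship it."""
    try:
        return pkgutil.get_data(package, filename)
    except FileNotFoundError:
        return None


def resolve_spec(package: str) -> Dict[str, Any]:
    """Prefer whichever of spec.yaml / spec.json exists; fail on both or neither."""
    yaml_spec = _load_optional_package_file(package, "spec.yaml")
    json_spec = _load_optional_package_file(package, "spec.json")

    if yaml_spec and json_spec:
        raise RuntimeError("Only one of spec.yaml or spec.json should be provided.")
    if yaml_spec:
        return yaml.safe_load(yaml_spec)
    if json_spec:
        return json.loads(json_spec)
    raise FileNotFoundError("Unable to find spec.yaml or spec.json in the package.")


# Hypothetical usage for a connector package named "source_example":
#   spec_obj = resolve_spec("source_example")
#   spec = ConnectorSpecification.parse_obj(spec_obj)
```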
+ + +class DefaultConnectorMixin: + # can be overridden to change an input config + def configure(self: _WriteConfigProtocol, config: Mapping[str, Any], temp_dir: str) -> Mapping[str, Any]: + config_path = os.path.join(temp_dir, "config.json") + self.write_config(config, config_path) + return config + + +class Connector(DefaultConnectorMixin, BaseConnector[Mapping[str, Any]], ABC): + ... diff --git a/airbyte-cdk/python/airbyte_cdk/destinations/destination.py b/airbyte-cdk/python/airbyte_cdk/destinations/destination.py index b46123dd0ffc2..97bb381113bde 100644 --- a/airbyte-cdk/python/airbyte_cdk/destinations/destination.py +++ b/airbyte-cdk/python/airbyte_cdk/destinations/destination.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # import argparse @@ -10,6 +10,7 @@ from typing import Any, Iterable, List, Mapping from airbyte_cdk.connector import Connector +from airbyte_cdk.exception_handler import init_uncaught_exception_handler from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit from pydantic import ValidationError @@ -83,6 +84,7 @@ def parse_args(self, args: List[str]) -> argparse.Namespace: return parsed_args def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]: + cmd = parsed_args.command if cmd not in self.VALID_CMDS: raise Exception(f"Unrecognized command: {cmd}") @@ -103,6 +105,7 @@ def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]: yield from self._run_write(config=config, configured_catalog_path=parsed_args.catalog, input_stream=wrapped_stdin) def run(self, args: List[str]): + init_uncaught_exception_handler(logger) parsed_args = self.parse_args(args) output_messages = self.run_cmd(parsed_args) for message in output_messages: diff --git a/airbyte-cdk/python/airbyte_cdk/entrypoint.py b/airbyte-cdk/python/airbyte_cdk/entrypoint.py index 493911b3293e5..7d57dc7e9e249 100644 --- a/airbyte-cdk/python/airbyte_cdk/entrypoint.py +++ b/airbyte-cdk/python/airbyte_cdk/entrypoint.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# @@ -9,21 +9,22 @@ import os.path import sys import tempfile -from typing import Any, Dict, Iterable, List +from typing import Iterable, List -from airbyte_cdk.logger import AirbyteLogFormatter, init_logger +from airbyte_cdk.exception_handler import init_uncaught_exception_handler +from airbyte_cdk.logger import init_logger from airbyte_cdk.models import AirbyteMessage, Status, Type from airbyte_cdk.models.airbyte_protocol import ConnectorSpecification from airbyte_cdk.sources import Source -from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit, get_secret_values, split_config -from airbyte_cdk.sources.utils.sentry import AirbyteSentry -from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets +from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit, split_config +from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets, update_secrets logger = init_logger("airbyte") class AirbyteEntrypoint(object): def __init__(self, source: Source): + init_uncaught_exception_handler(logger) self.source = source self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}") @@ -31,6 +32,7 @@ def __init__(self, source: Source): def parse_args(args: List[str]) -> argparse.Namespace: # set up parent parsers parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument("--debug", action="store_true", help="enables detailed debug logs related to the sync") main_parser = argparse.ArgumentParser() subparsers = main_parser.add_subparsers(title="commands", dest="command") @@ -61,23 +63,19 @@ def parse_args(args: List[str]) -> argparse.Namespace: return main_parser.parse_args(args) - def configure_sentry(self, spec_schema: Dict[str, Any], parsed_args: argparse.Namespace): - secret_values = [] - if "config" in parsed_args: - config = self.source.read_config(parsed_args.config) - secret_values = get_secret_values(spec_schema, config) - source_name = self.source.__module__.split(".")[0] - source_name = source_name.split("_", 1)[-1] - AirbyteSentry.init(source_tag=source_name, secret_values=secret_values) - def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: cmd = parsed_args.command if not cmd: raise Exception("No command passed") + if hasattr(parsed_args, "debug") and parsed_args.debug: + self.logger.setLevel(logging.DEBUG) + self.logger.debug("Debug logs enabled") + else: + self.logger.setLevel(logging.INFO) + # todo: add try catch for exceptions with different exit codes source_spec: ConnectorSpecification = self.source.spec(self.logger) - self.configure_sentry(source_spec.connectionSpecification, parsed_args) with tempfile.TemporaryDirectory() as temp_dir: if cmd == "spec": message = AirbyteMessage(type=Type.SPEC, spec=source_spec) @@ -88,16 +86,14 @@ def run(self, parsed_args: argparse.Namespace) -> Iterable[str]: # Now that we have the config, we can use it to get a list of ai airbyte_secrets # that we should filter in logging to avoid leaking secrets - config_secrets = get_secrets(self.source, config, self.logger) - AirbyteLogFormatter.update_secrets(config_secrets) + config_secrets = get_secrets(source_spec.connectionSpecification, config) + update_secrets(config_secrets) # Remove internal flags from config before validating so # jsonschema's additionalProperties flag wont fail the validation - config, internal_config = split_config(config) + connector_config, _ = split_config(config) if self.source.check_config_against_spec or cmd == "check": - check_config_against_spec_or_exit(config, 
source_spec) - # Put internal flags back to config dict - config.update(internal_config.dict()) + check_config_against_spec_or_exit(connector_config, source_spec) if cmd == "check": check_result = self.source.check(self.logger, config) diff --git a/airbyte-cdk/python/airbyte_cdk/exception_handler.py b/airbyte-cdk/python/airbyte_cdk/exception_handler.py new file mode 100644 index 0000000000000..b88390b337d7d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/exception_handler.py @@ -0,0 +1,34 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import logging +import sys + +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +def init_uncaught_exception_handler(logger: logging.Logger) -> None: + """ + Handles uncaught exceptions by emitting an AirbyteTraceMessage and making sure they are not + printed to the console without having secrets removed. + """ + + def hook_fn(exception_type, exception_value, traceback_): + # For developer ergonomics, we want to see the stack trace in the logs when we do a ctrl-c + if issubclass(exception_type, KeyboardInterrupt): + sys.__excepthook__(exception_type, exception_value, traceback_) + return + + logger.fatal(exception_value, exc_info=exception_value) + + # emit an AirbyteTraceMessage for any exception that gets to this spot + traced_exc = ( + exception_value + if issubclass(exception_type, AirbyteTracedException) + else AirbyteTracedException.from_exception(exception_value) + ) + + traced_exc.emit_message() + + sys.excepthook = hook_fn diff --git a/airbyte-cdk/python/airbyte_cdk/logger.py b/airbyte-cdk/python/airbyte_cdk/logger.py index 1cfb72175a620..26882fdd4d4f6 100644 --- a/airbyte-cdk/python/airbyte_cdk/logger.py +++ b/airbyte-cdk/python/airbyte_cdk/logger.py @@ -1,16 +1,16 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # +import json import logging import logging.config -import sys import traceback -from typing import List, Tuple +from typing import Tuple from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage - -TRACE_LEVEL_NUM = 5 +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets +from deprecated import deprecated LOGGING_CONFIG = { "version": 1, @@ -31,42 +31,17 @@ } -def init_unhandled_exception_output_filtering(logger: logging.Logger) -> None: - """ - Make sure unhandled exceptions are not printed to the console without passing through the Airbyte logger and having - secrets removed. 
- """ - - def hook_fn(exception_type, exception_value, traceback_): - # For developer ergonomics, we want to see the stack trace in the logs when we do a ctrl-c - if issubclass(exception_type, KeyboardInterrupt): - sys.__excepthook__(exception_type, exception_value, traceback_) - else: - logger.critical(exception_value, exc_info=exception_value) - - sys.excepthook = hook_fn - - def init_logger(name: str = None): """Initial set up of logger""" - logging.addLevelName(TRACE_LEVEL_NUM, "TRACE") logger = logging.getLogger(name) - logger.setLevel(TRACE_LEVEL_NUM) + logger.setLevel(logging.INFO) logging.config.dictConfig(LOGGING_CONFIG) - init_unhandled_exception_output_filtering(logger) return logger class AirbyteLogFormatter(logging.Formatter): """Output log records using AirbyteMessage""" - _secrets: List[str] = [] - - @classmethod - def update_secrets(cls, secrets: List[str]): - """Update the list of secrets to be replaced in the log message""" - cls._secrets = secrets - # Transforming Python log levels to Airbyte protocol log levels level_mapping = { logging.FATAL: "FATAL", @@ -74,17 +49,30 @@ def update_secrets(cls, secrets: List[str]): logging.WARNING: "WARN", logging.INFO: "INFO", logging.DEBUG: "DEBUG", - TRACE_LEVEL_NUM: "TRACE", } def format(self, record: logging.LogRecord) -> str: """Return a JSON representation of the log message""" - message = super().format(record) airbyte_level = self.level_mapping.get(record.levelno, "INFO") - for secret in AirbyteLogFormatter._secrets: - message = message.replace(secret, "****") - log_message = AirbyteMessage(type="LOG", log=AirbyteLogMessage(level=airbyte_level, message=message)) - return log_message.json(exclude_unset=True) + if airbyte_level == "DEBUG": + extras = self.extract_extra_args_from_record(record) + debug_dict = {"type": "DEBUG", "message": record.getMessage(), "data": extras} + return filter_secrets(json.dumps(debug_dict)) + else: + message = super().format(record) + message = filter_secrets(message) + log_message = AirbyteMessage(type="LOG", log=AirbyteLogMessage(level=airbyte_level, message=message)) + return log_message.json(exclude_unset=True) + + @staticmethod + def extract_extra_args_from_record(record: logging.LogRecord): + """ + The python logger conflates default args with extra args. We use an empty log record and set operations + to isolate fields passed to the log record via extra by the developer. + """ + default_attrs = logging.LogRecord("", 0, "", 0, None, None, None).__dict__.keys() + extra_keys = set(record.__dict__.keys()) - default_attrs + return {k: str(getattr(record, k)) for k in extra_keys if hasattr(record, k)} def log_by_prefix(msg: str, default_level: str) -> Tuple[int, str]: @@ -102,6 +90,7 @@ def log_by_prefix(msg: str, default_level: str) -> Tuple[int, str]: return log_level, rendered_message +@deprecated(version="0.1.47", reason="Use logging.getLogger('airbyte') instead") class AirbyteLogger: def log(self, level, message): log_record = AirbyteLogMessage(level=level, message=message) diff --git a/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py index 39680a330c04a..b412feba27798 100644 --- a/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py +++ b/airbyte-cdk/python/airbyte_cdk/models/airbyte_protocol.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# # generated by datamodel-codegen: @@ -20,26 +20,41 @@ class Type(Enum): SPEC = "SPEC" CONNECTION_STATUS = "CONNECTION_STATUS" CATALOG = "CATALOG" + TRACE = "TRACE" class AirbyteRecordMessage(BaseModel): class Config: extra = Extra.allow - stream: str = Field(..., description="the name of this record's stream") - data: Dict[str, Any] = Field(..., description="the record data") + namespace: Optional[str] = Field(None, description="namespace the data is associated with") + stream: str = Field(..., description="stream the data is associated with") + data: Dict[str, Any] = Field(..., description="record data") emitted_at: int = Field( ..., description="when the data was emitted from the source. epoch in millisecond.", ) - namespace: Optional[str] = Field(None, description="the namespace of this record's stream") -class AirbyteStateMessage(BaseModel): +class AirbyteStateType(Enum): + GLOBAL = "GLOBAL" + STREAM = "STREAM" + LEGACY = "LEGACY" + + +class StreamDescriptor(BaseModel): class Config: extra = Extra.allow - data: Dict[str, Any] = Field(..., description="the state data") + name: str + namespace: Optional[str] = None + + +class AirbyteStateBlob(BaseModel): + pass + + class Config: + extra = Extra.allow class Level(Enum): @@ -55,8 +70,27 @@ class AirbyteLogMessage(BaseModel): class Config: extra = Extra.allow - level: Level = Field(..., description="the type of logging") - message: str = Field(..., description="the log message") + level: Level = Field(..., description="log level") + message: str = Field(..., description="log message") + + +class TraceType(Enum): + ERROR = "ERROR" + + +class FailureType(Enum): + system_error = "system_error" + config_error = "config_error" + + +class AirbyteErrorTraceMessage(BaseModel): + class Config: + extra = Extra.allow + + message: str = Field(..., description="A user-friendly message that indicates the cause of the error") + internal_message: Optional[str] = Field(None, description="The internal error that caused the failure") + stack_trace: Optional[str] = Field(None, description="The full stack trace of the error") + failure_type: Optional[FailureType] = Field(None, description="The type of error") class Status(Enum): @@ -119,6 +153,9 @@ class AuthFlowType(Enum): class OAuthConfigSpecification(BaseModel): + class Config: + extra = Extra.allow + oauth_user_input_from_connector_config_specification: Optional[Dict[str, Any]] = Field( None, description="OAuth specific blob. 
This is a Json Schema used to validate Json configurations used as input to OAuth.\nMust be a valid non-nested JSON that refers to properties from ConnectorSpecification.connectionSpecification\nusing special annotation 'path_in_connector_config'.\nThese are input values the user is entering through the UI to authenticate to the connector, that might also shared\nas inputs for syncing data via the connector.\n\nExamples:\n\nif no connector values is shared during oauth flow, oauth_user_input_from_connector_config_specification=[]\nif connector values such as 'app_id' inside the top level are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['app_id']\n }\n }\nif connector values such as 'info.app_id' nested inside another object are used to generate the API url for the oauth flow,\n oauth_user_input_from_connector_config_specification={\n app_id: {\n type: string\n path_in_connector_config: ['info', 'app_id']\n }\n }", @@ -137,6 +174,31 @@ class OAuthConfigSpecification(BaseModel): ) +class AirbyteStreamState(BaseModel): + class Config: + extra = Extra.allow + + stream_descriptor: StreamDescriptor + stream_state: Optional[AirbyteStateBlob] = None + + +class AirbyteGlobalState(BaseModel): + class Config: + extra = Extra.allow + + shared_state: Optional[AirbyteStateBlob] = None + stream_states: List[AirbyteStreamState] + + +class AirbyteTraceMessage(BaseModel): + class Config: + extra = Extra.allow + + type: TraceType = Field(..., description="the type of trace message", title="trace type") + emitted_at: float = Field(..., description="the time in ms that the message was emitted") + error: Optional[AirbyteErrorTraceMessage] = Field(None, description="error trace message: the error object") + + class AirbyteStream(BaseModel): class Config: extra = Extra.allow @@ -202,7 +264,10 @@ class Config: ..., description="ConnectorDefinition specific blob. Must be a valid JSON string.", ) - supportsIncremental: Optional[bool] = Field(None, description="If the connector supports incremental mode or not.") + supportsIncremental: Optional[bool] = Field( + None, + description="(deprecated) If the connector supports incremental mode or not.", + ) supportsNormalization: Optional[bool] = Field(False, description="If the connector supports normalization or not.") supportsDBT: Optional[bool] = Field(False, description="If the connector supports DBT or not.") supported_destination_sync_modes: Optional[List[DestinationSyncMode]] = Field( @@ -215,6 +280,16 @@ class Config: ) +class AirbyteStateMessage(BaseModel): + class Config: + extra = Extra.allow + + type: Optional[AirbyteStateType] = None + stream: Optional[AirbyteStreamState] = None + global_: Optional[AirbyteGlobalState] = Field(None, alias="global") + data: Optional[Dict[str, Any]] = Field(None, description="(Deprecated) the state data") + + class AirbyteCatalog(BaseModel): class Config: extra = Extra.allow @@ -240,12 +315,16 @@ class Config: ) spec: Optional[ConnectorSpecification] = None connectionStatus: Optional[AirbyteConnectionStatus] = None - catalog: Optional[AirbyteCatalog] = Field(None, description="catalog message: the calalog") + catalog: Optional[AirbyteCatalog] = Field(None, description="catalog message: the catalog") record: Optional[AirbyteRecordMessage] = Field(None, description="record message: the record") state: Optional[AirbyteStateMessage] = Field( None, description="schema message: the state. 
Must be the last message produced. The platform uses this information", ) + trace: Optional[AirbyteTraceMessage] = Field( + None, + description="trace message: a message to communicate information about the status and performance of a connector", + ) class AirbyteProtocol(BaseModel): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py b/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py index d2e81e99b3505..e4585ae7c2fc4 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/abstract_source.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -28,6 +28,7 @@ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config from airbyte_cdk.sources.utils.transform import TypeTransformer from airbyte_cdk.utils.event_timing import create_timer +from airbyte_cdk.utils.traced_exception import AirbyteTracedException class AbstractSource(Source, ABC): @@ -52,7 +53,8 @@ def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> @abstractmethod def streams(self, config: Mapping[str, Any]) -> List[Stream]: """ - :param config: The user-provided configuration as specified by the source's spec. Any stream construction related operation should happen here. + :param config: The user-provided configuration as specified by the source's spec. + Any stream construction related operation should happen here. :return: A list of the streams in this source connector. """ @@ -65,12 +67,16 @@ def name(self) -> str: return self.__class__.__name__ def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: - """Implements the Discover operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification.""" + """Implements the Discover operation from the Airbyte Specification. + See https://docs.airbyte.io/architecture/airbyte-protocol. + """ streams = [stream.as_airbyte_stream() for stream in self.streams(config=config)] return AirbyteCatalog(streams=streams) def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteConnectionStatus: - """Implements the Check Connection operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification.""" + """Implements the Check Connection operation from the Airbyte Specification. + See https://docs.airbyte.io/architecture/airbyte-protocol. + """ try: check_succeeded, error = self.check_connection(logger, config) if not check_succeeded: @@ -81,9 +87,13 @@ def check(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCon return AirbyteConnectionStatus(status=Status.SUCCEEDED) def read( - self, logger: logging.Logger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog, state: MutableMapping[str, Any] = None + self, + logger: logging.Logger, + config: Mapping[str, Any], + catalog: ConfiguredAirbyteCatalog, + state: MutableMapping[str, Any] = None, ) -> Iterator[AirbyteMessage]: - """Implements the Read operation from the Airbyte Specification. See https://docs.airbyte.io/architecture/airbyte-specification.""" + """Implements the Read operation from the Airbyte Specification. 
See https://docs.airbyte.io/architecture/airbyte-protocol.""" connector_state = copy.deepcopy(state or {}) logger.info(f"Starting syncing {self.name}") config, internal_config = split_config(config) @@ -96,10 +106,11 @@ def read( stream_instance = stream_instances.get(configured_stream.stream.name) if not stream_instance: raise KeyError( - f"The requested stream {configured_stream.stream.name} was not found in the source. Available streams: {stream_instances.keys()}" + f"The requested stream {configured_stream.stream.name} was not found in the source." + f" Available streams: {stream_instances.keys()}" ) - try: + timer.start_event(f"Syncing stream {configured_stream.stream.name}") yield from self._read_stream( logger=logger, stream_instance=stream_instance, @@ -107,11 +118,17 @@ def read( connector_state=connector_state, internal_config=internal_config, ) + except AirbyteTracedException as e: + raise e except Exception as e: - logger.exception(f"Encountered an exception while reading stream {self.name}") + logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}") + display_message = stream_instance.get_error_display_message(e) + if display_message: + raise AirbyteTracedException.from_exception(e, message=display_message) from e raise e finally: - logger.info(f"Finished syncing {self.name}") + timer.finish_event() + logger.info(f"Finished syncing {configured_stream.stream.name}") logger.info(timer.report()) logger.info(f"Finished syncing {self.name}") @@ -124,16 +141,37 @@ def _read_stream( connector_state: MutableMapping[str, Any], internal_config: InternalConfig, ) -> Iterator[AirbyteMessage]: - + self._apply_log_level_to_stream_logger(logger, stream_instance) if internal_config.page_size and isinstance(stream_instance, HttpStream): logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}") stream_instance.page_size = internal_config.page_size + logger.debug( + f"Syncing configured stream: {configured_stream.stream.name}", + extra={ + "sync_mode": configured_stream.sync_mode, + "primary_key": configured_stream.primary_key, + "cursor_field": configured_stream.cursor_field, + }, + ) + logger.debug( + f"Syncing stream instance: {stream_instance.name}", + extra={ + "primary_key": stream_instance.primary_key, + "cursor_field": stream_instance.cursor_field, + }, + ) use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental if use_incremental: - record_iterator = self._read_incremental(logger, stream_instance, configured_stream, connector_state, internal_config) + record_iterator = self._read_incremental( + logger, + stream_instance, + configured_stream, + connector_state, + internal_config, + ) else: - record_iterator = self._read_full_refresh(stream_instance, configured_stream, internal_config) + record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config) record_counter = 0 stream_name = configured_stream.stream.name @@ -166,19 +204,37 @@ def _read_incremental( connector_state: MutableMapping[str, Any], internal_config: InternalConfig, ) -> Iterator[AirbyteMessage]: + """Read stream using incremental algorithm + + :param logger: + :param stream_instance: + :param configured_stream: + :param connector_state: + :param internal_config: + :return: + """ stream_name = configured_stream.stream.name stream_state = connector_state.get(stream_name, {}) - if stream_state: + if stream_state and "state" in dir(stream_instance): + 
stream_instance.state = stream_state logger.info(f"Setting state of {stream_name} stream to {stream_state}") slices = stream_instance.stream_slices( - cursor_field=configured_stream.cursor_field, sync_mode=SyncMode.incremental, stream_state=stream_state + cursor_field=configured_stream.cursor_field, + sync_mode=SyncMode.incremental, + stream_state=stream_state, ) + logger.debug(f"Processing stream slices for {stream_name}", extra={"stream_slices": slices}) total_records_counter = 0 - for slice in slices: + if not slices: + # Safety net to ensure we always emit at least one state message even if there are no slices + checkpoint = self._checkpoint_state(stream_instance, stream_instance.state, connector_state) + yield checkpoint + for _slice in slices: + logger.debug("Processing stream slice", extra={"slice": _slice}) records = stream_instance.read_records( sync_mode=SyncMode.incremental, - stream_slice=slice, + stream_slice=_slice, stream_state=stream_state, cursor_field=configured_stream.cursor_field or None, ) @@ -187,7 +243,7 @@ def _read_incremental( stream_state = stream_instance.get_updated_state(stream_state, record_data) checkpoint_interval = stream_instance.state_checkpoint_interval if checkpoint_interval and record_counter % checkpoint_interval == 0: - yield self._checkpoint_state(stream_name, stream_state, connector_state, logger) + yield self._checkpoint_state(stream_instance, stream_state, connector_state) total_records_counter += 1 # This functionality should ideally live outside of this method @@ -197,18 +253,26 @@ def _read_incremental( # Break from slice loop to save state and exit from _read_incremental function. break - yield self._checkpoint_state(stream_name, stream_state, connector_state, logger) + yield self._checkpoint_state(stream_instance, stream_state, connector_state) if self._limit_reached(internal_config, total_records_counter): return def _read_full_refresh( - self, stream_instance: Stream, configured_stream: ConfiguredAirbyteStream, internal_config: InternalConfig + self, + logger: logging.Logger, + stream_instance: Stream, + configured_stream: ConfiguredAirbyteStream, + internal_config: InternalConfig, ) -> Iterator[AirbyteMessage]: slices = stream_instance.stream_slices(sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field) + logger.debug(f"Processing stream slices for {configured_stream.stream.name}", extra={"stream_slices": slices}) total_records_counter = 0 - for slice in slices: + for _slice in slices: + logger.debug("Processing stream slice", extra={"slice": _slice}) records = stream_instance.read_records( - stream_slice=slice, sync_mode=SyncMode.full_refresh, cursor_field=configured_stream.cursor_field + stream_slice=_slice, + sync_mode=SyncMode.full_refresh, + cursor_field=configured_stream.cursor_field, ) for record in records: yield self._as_airbyte_record(configured_stream.stream.name, record) @@ -216,9 +280,12 @@ def _read_full_refresh( if self._limit_reached(internal_config, total_records_counter): return - def _checkpoint_state(self, stream_name, stream_state, connector_state, logger): - logger.info(f"Setting state of {stream_name} stream to {stream_state}") - connector_state[stream_name] = stream_state + def _checkpoint_state(self, stream, stream_state, connector_state): + try: + connector_state[stream.name] = stream.state + except AttributeError: + connector_state[stream.name] = stream_state + return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=connector_state)) @lru_cache(maxsize=None) @@ -243,3 
+310,12 @@ def _as_airbyte_record(self, stream_name: str, data: Mapping[str, Any]): transformer.transform(data, schema) # type: ignore message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis) return AirbyteMessage(type=MessageType.RECORD, record=message) + + @staticmethod + def _apply_log_level_to_stream_logger(logger: logging.Logger, stream_instance: Stream): + """ + Necessary because we use different loggers at the source and stream levels. We must + apply the source's log level to each stream's logger. + """ + if hasattr(logger, "level"): + stream_instance.logger.setLevel(logger.level) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/config.py b/airbyte-cdk/python/airbyte_cdk/sources/config.py index b96672ef86e49..f3e0d0344da5e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/config.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/config.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # from typing import Any, Dict diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py new file mode 100644 index 0000000000000..eb3b84dde7203 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeOauth2Authenticator + +__all__ = [ + "DeclarativeOauth2Authenticator", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py new file mode 100644 index 0000000000000..ff9d5ef8b104a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/oauth.py @@ -0,0 +1,99 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional, Union + +import pendulum +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth import AbstractOauth2Authenticator +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class DeclarativeOauth2Authenticator(AbstractOauth2Authenticator, JsonSchemaMixin): + """ + Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials based on + a declarative connector configuration file. Credentials can be defined explicitly or via interpolation + at runtime. The generated access token is attached to each request via the Authorization header. 
+ + Attributes: + token_refresh_endpoint (Union[InterpolatedString, str]): The endpoint to refresh the access token + client_id (Union[InterpolatedString, str]): The client id + client_secret (Union[InterpolatedString, str]): Client secret + refresh_token (Union[InterpolatedString, str]): The token used to refresh the access token + access_token_name (Union[InterpolatedString, str]): THe field to extract access token from in the response + expires_in_name (Union[InterpolatedString, str]): The field to extract expires_in from in the response + config (Mapping[str, Any]): The user-provided configuration as specified by the source's spec + scopes (Optional[List[str]]): The scopes to request + token_expiry_date (Optional[Union[InterpolatedString, str]]): The access token expiration date + refresh_request_body (Optional[Mapping[str, Any]]): The request body to send in the refresh request + """ + + token_refresh_endpoint: Union[InterpolatedString, str] + client_id: Union[InterpolatedString, str] + client_secret: Union[InterpolatedString, str] + refresh_token: Union[InterpolatedString, str] + config: Mapping[str, Any] + options: InitVar[Mapping[str, Any]] + scopes: Optional[List[str]] = None + token_expiry_date: Optional[Union[InterpolatedString, str]] = None + _token_expiry_date: pendulum.DateTime = field(init=False, repr=False) + access_token_name: Union[InterpolatedString, str] = "access_token" + expires_in_name: Union[InterpolatedString, str] = "expires_in" + refresh_request_body: Optional[Mapping[str, Any]] = None + + def __post_init__(self, options: Mapping[str, Any]): + self.token_refresh_endpoint = InterpolatedString.create(self.token_refresh_endpoint, options=options) + self.client_id = InterpolatedString.create(self.client_id, options=options) + self.client_secret = InterpolatedString.create(self.client_secret, options=options) + self.refresh_token = InterpolatedString.create(self.refresh_token, options=options) + self.access_token_name = InterpolatedString.create(self.access_token_name, options=options) + self.expires_in_name = InterpolatedString.create(self.expires_in_name, options=options) + self._refresh_request_body = InterpolatedMapping(self.refresh_request_body or {}, options=options) + self._token_expiry_date = ( + pendulum.parse(InterpolatedString.create(self.token_expiry_date, options=options).eval(self.config)) + if self.token_expiry_date + else pendulum.now().subtract(days=1) + ) + self._access_token = None + + def get_token_refresh_endpoint(self) -> str: + return self.token_refresh_endpoint.eval(self.config) + + def get_client_id(self) -> str: + return self.client_id.eval(self.config) + + def get_client_secret(self) -> str: + return self.client_secret.eval(self.config) + + def get_refresh_token(self) -> str: + return self.refresh_token.eval(self.config) + + def get_scopes(self) -> [str]: + return self.scopes + + def get_access_token_name(self) -> InterpolatedString: + return self.access_token_name.eval(self.config) + + def get_expires_in_name(self) -> InterpolatedString: + return self.expires_in_name.eval(self.config) + + def get_refresh_request_body(self) -> Mapping[str, Any]: + return self._refresh_request_body.eval(self.config) + + def get_token_expiry_date(self) -> pendulum.DateTime: + return self._token_expiry_date + + def set_token_expiry_date(self, value: pendulum.DateTime): + self._token_expiry_date = value + + @property + def access_token(self) -> str: + return self._access_token + + @access_token.setter + def access_token(self, value: str): + self._access_token = 
value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py new file mode 100644 index 0000000000000..04790ae9e3036 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/auth/token.py @@ -0,0 +1,116 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import base64 +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.types import Config +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class ApiKeyAuthenticator(AbstractHeaderAuthenticator, JsonSchemaMixin): + """ + ApiKeyAuth sets a request header on the HTTP requests sent. + + The header is of the form: + `"

": ""` + + For example, + `ApiKeyAuthenticator("Authorization", "Bearer hello")` + will result in the following header set on the HTTP request + `"Authorization": "Bearer hello"` + + Attributes: + header (Union[InterpolatedString, str]): Header key to set on the HTTP requests + api_token (Union[InterpolatedString, str]): Header value to set on the HTTP requests + config (Config): The user-provided configuration as specified by the source's spec + options (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + header: Union[InterpolatedString, str] + api_token: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._header = InterpolatedString.create(self.header, options=options) + self._token = InterpolatedString.create(self.api_token, options=options) + + @property + def auth_header(self) -> str: + return self._header.eval(self.config) + + @property + def token(self) -> str: + return self._token.eval(self.config) + + +@dataclass +class BearerAuthenticator(AbstractHeaderAuthenticator, JsonSchemaMixin): + """ + Authenticator that sets the Authorization header on the HTTP requests sent. + + The header is of the form: + `"Authorization": "Bearer "` + + Attributes: + api_token (Union[InterpolatedString, str]): The bearer token + config (Config): The user-provided configuration as specified by the source's spec + options (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + api_token: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._token = InterpolatedString.create(self.api_token, options=options) + + @property + def auth_header(self) -> str: + return "Authorization" + + @property + def token(self) -> str: + return f"Bearer {self._token.eval(self.config)}" + + +@dataclass +class BasicHttpAuthenticator(AbstractHeaderAuthenticator): + """ + Builds auth based off the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using bas64 + https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme + + The header is of the form + `"Authorization": "Basic "` + + Attributes: + username (Union[InterpolatedString, str]): The username + config (Config): The user-provided configuration as specified by the source's spec + password (Union[InterpolatedString, str]): The password + options (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + username: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + password: Union[InterpolatedString, str] = "" + + def __post_init__(self, options): + self._username = InterpolatedString.create(self.username, options=options) + self._password = InterpolatedString.create(self.password, options=options) + + @property + def auth_header(self) -> str: + return "Authorization" + + @property + def token(self) -> str: + auth_string = f"{self._username.eval(self.config)}:{self._password.eval(self.config)}".encode("utf8") + b64_encoded = base64.b64encode(auth_string).decode("utf8") + return f"Basic {b64_encoded}" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py new file mode 100644 index 0000000000000..fb6665d946251 --- /dev/null +++ 
b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker + +__all__ = ["CheckStream", "ConnectionChecker"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py new file mode 100644 index 0000000000000..decf9fefc862a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/check_stream.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import logging +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Tuple + +from airbyte_cdk.models.airbyte_protocol import SyncMode +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.source import Source +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class CheckStream(ConnectionChecker, JsonSchemaMixin): + """ + Checks the connections by trying to read records from one or many of the streams selected by the developer + + Attributes: + stream_name (List[str]): name of streams to read records from + """ + + stream_names: List[str] + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._options = options + + def check_connection(self, source: Source, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, any]: + streams = source.streams(config) + stream_name_to_stream = {s.name: s for s in streams} + if len(streams) == 0: + return False, f"No streams to connect to from source {source}" + for stream_name in self.stream_names: + if stream_name in stream_name_to_stream.keys(): + stream = stream_name_to_stream[stream_name] + try: + records = stream.read_records(sync_mode=SyncMode.full_refresh) + next(records) + except Exception as error: + return False, f"Unable to connect to stream {stream} - {error}" + else: + raise ValueError(f"{stream_name} is not part of the catalog. Expected one of {stream_name_to_stream.keys()}") + return True, None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py new file mode 100644 index 0000000000000..6e76244e17172 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/checks/connection_checker.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import logging +from abc import ABC, abstractmethod +from typing import Any, Mapping, Tuple + +from airbyte_cdk.sources.source import Source + + +class ConnectionChecker(ABC): + """ + Abstract base class for checking a connection + """ + + @abstractmethod + def check_connection(self, source: Source, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, any]: + """ + Tests if the input configuration can be used to successfully connect to the integration e.g: if a provided Stripe API token can be used to connect + to the Stripe API. + + :param source: source + :param logger: source logger + :param config: The user-provided configuration as specified by the source's spec. + This usually contains information required to check connection e.g. tokens, secrets and keys etc. + :return: A tuple of (boolean, error). 
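A short sketch of how the `CheckStream` checker defined just above might be exercised; `my_source` and `my_config` are assumed placeholders, so the actual call is left commented out:

```python
import logging

from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream

# Validate connectivity by attempting to read one record from the "customers" stream.
checker = CheckStream(stream_names=["customers"], options={})

# `my_source` and `my_config` are assumed: a source exposing a "customers" stream
# and its user-provided configuration.
# ok, error = checker.check_connection(my_source, logging.getLogger("airbyte"), my_config)
```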
If boolean is true, then the connection check is successful + and we can connect to the underlying data source using the provided configuration. + Otherwise, the input config cannot be used to connect to the underlying data source, + and the "error" object should describe what went wrong. + The error object will be cast to string to display the problem to the user. + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/create_partial.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/create_partial.py new file mode 100644 index 0000000000000..c941153f3f84d --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/create_partial.py @@ -0,0 +1,84 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import inspect + +OPTIONS_STR = "$options" + + +def create(func, /, *args, **keywords): + """ + Create a partial on steroids. + Returns a partial object which when called will behave like func called with the arguments supplied. + Parameters will be interpolated before the creation of the object + The interpolation will take in kwargs, and config as parameters that can be accessed through interpolating. + If any of the parameters are also create functions, they will also be created. + kwargs are propagated to the recursive method calls + + :param func: Function + :param args: + :param keywords: + :return: partially created object + """ + + def newfunc(*fargs, **fkeywords): + all_keywords = {**keywords} + all_keywords.update(fkeywords) + + # config is a special keyword used for interpolation + config = all_keywords.pop("config", None) + + # $options is a special keyword used for interpolation and propagation + if OPTIONS_STR in all_keywords: + options = all_keywords.get(OPTIONS_STR) + else: + options = dict() + + # if config is not none, add it back to the keywords mapping + if config is not None: + all_keywords["config"] = config + + kwargs_to_pass_down = _get_kwargs_to_pass_to_func(func, options) + all_keywords_to_pass_down = _get_kwargs_to_pass_to_func(func, all_keywords) + + # options is required as part of creation of all declarative components + dynamic_args = {**all_keywords_to_pass_down, **kwargs_to_pass_down} + if "options" not in dynamic_args: + dynamic_args["options"] = {} + else: + # Handles the case where kwarg options and keyword $options both exist. 
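To make the partial-creation helper described above concrete, the following sketch builds an `ApiKeyAuthenticator` whose config is only injected when the partial is finally invoked; the header and config key are illustrative:

```python
from airbyte_cdk.sources.declarative.auth.token import ApiKeyAuthenticator
from airbyte_cdk.sources.declarative.create_partial import OPTIONS_STR, create

# Interpolated fields are fixed up-front; the config is supplied when the partial is called.
builder = create(
    ApiKeyAuthenticator,
    header="Authorization",
    api_token="Bearer {{ config['api_key'] }}",
    **{OPTIONS_STR: {}},
)
authenticator = builder(config={"api_key": "XYZ"})
print(authenticator.token)  # -> "Bearer XYZ"
```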
We should merge both sets of options + # before creating the component + dynamic_args["options"] = {**all_keywords_to_pass_down["options"], **kwargs_to_pass_down["options"]} + try: + ret = func(*args, *fargs, **dynamic_args) + except TypeError as e: + raise Exception(f"failed to create object of type {func} because {e}") + return ret + + newfunc.func = func + newfunc.args = args + newfunc.kwargs = keywords + + return newfunc + + +def _get_kwargs_to_pass_to_func(func, options): + argspec = inspect.getfullargspec(func) + kwargs_to_pass_down = set(argspec.kwonlyargs) + args_to_pass_down = set(argspec.args) + all_args = args_to_pass_down.union(kwargs_to_pass_down) + kwargs_to_pass_down = {k: v for k, v in options.items() if k in all_args} + if "options" in all_args: + kwargs_to_pass_down["options"] = options + return kwargs_to_pass_down + + +def _create_inner_objects(keywords, kwargs): + fully_created = dict() + for k, v in keywords.items(): + if type(v) == type(create): + fully_created[k] = v(kwargs=kwargs) + else: + fully_created[k] = v + return fully_created diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py new file mode 100644 index 0000000000000..3832a103f6822 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/__init__.py @@ -0,0 +1,7 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime + +__all__ = ["MinMaxDatetime"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py new file mode 100644 index 0000000000000..0c4b5232cf696 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/datetime/min_max_datetime.py @@ -0,0 +1,85 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import datetime as dt +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class MinMaxDatetime(JsonSchemaMixin): + """ + Compares the provided date against optional minimum or maximum times. If date is earlier than + min_date, then min_date is returned. If date is greater than max_date, then max_date is returned. + If neither, the input date is returned. + + The timestamp format accepts the same format codes as datetime.strfptime, which are + all the format codes required by the 1989 C standard. + Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html + + Attributes: + datetime (Union[InterpolatedString, str]): InterpolatedString or string representing the datetime in the format specified by `datetime_format` + datetime_format (str): Format of the datetime passed as argument + min_datetime (Union[InterpolatedString, str]): Represents the minimum allowed datetime value. + max_datetime (Union[InterpolatedString, str]): Represents the maximum allowed datetime value. 
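A small sketch of the clamping behaviour described above, relying on the default datetime format; the `start_date` config key and the minimum date are illustrative:

```python
from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime

# The configured start date is clamped so it is never earlier than 2021-01-01.
start = MinMaxDatetime(
    datetime="{{ config['start_date'] }}",
    min_datetime="2021-01-01T00:00:00.000000+0000",
    options={},
)
print(start.get_datetime(config={"start_date": "2020-06-01T00:00:00.000000+0000"}))
# -> 2021-01-01 00:00:00+00:00
```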
+ """ + + datetime: Union[InterpolatedString, str] + options: InitVar[Mapping[str, Any]] + # datetime_format is a unique case where we inherit it from the parent if it is not specified before using the default value + # which is why we need dedicated getter/setter methods and private dataclass field + datetime_format: str = "" + _datetime_format: str = field(init=False, repr=False, default="") + min_datetime: Union[InterpolatedString, str] = "" + max_datetime: Union[InterpolatedString, str] = "" + + def __post_init__(self, options: Mapping[str, Any]): + self.datetime = InterpolatedString.create(self.datetime, options=options or {}) + self.timezone = dt.timezone.utc + self.min_datetime = InterpolatedString.create(self.min_datetime, options=options) if self.min_datetime else None + self.max_datetime = InterpolatedString.create(self.max_datetime, options=options) if self.max_datetime else None + + self._timezone = dt.timezone.utc + + def get_datetime(self, config, **additional_options) -> dt.datetime: + """ + Evaluates and returns the datetime + :param config: The user-provided configuration as specified by the source's spec + :param additional_options: Additional arguments to be passed to the strings for interpolation + :return: The evaluated datetime + """ + # We apply a default datetime format here instead of at instantiation, so it can be set by the parent first + datetime_format = self._datetime_format + if not datetime_format: + datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" + + time = dt.datetime.strptime(str(self.datetime.eval(config, **additional_options)), datetime_format).replace(tzinfo=self._timezone) + + if self.min_datetime: + min_time = dt.datetime.strptime(str(self.min_datetime.eval(config, **additional_options)), datetime_format).replace( + tzinfo=self._timezone + ) + time = max(time, min_time) + if self.max_datetime: + max_time = dt.datetime.strptime(str(self.max_datetime.eval(config, **additional_options)), datetime_format).replace( + tzinfo=self._timezone + ) + time = min(time, max_time) + return time + + @property + def datetime_format(self) -> str: + """The format of the string representing the datetime""" + return self._datetime_format + + @datetime_format.setter + def datetime_format(self, value: str): + """Setter for the datetime format""" + # Covers the case where datetime_format is not provided in the constructor, which causes the property object + # to be set which we need to avoid doing + if not isinstance(value, property): + self._datetime_format = value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py new file mode 100644 index 0000000000000..6e79356ee93b7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_source.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from typing import Tuple + +from airbyte_cdk.sources.abstract_source import AbstractSource +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker + + +class DeclarativeSource(AbstractSource): + """ + Base class for declarative Source. 
Concrete sources need to define the connection_checker to use + """ + + @property + @abstractmethod + def connection_checker(self) -> ConnectionChecker: + """Returns the ConnectioChecker to use for the `check` operation""" + + def check_connection(self, logger, config) -> Tuple[bool, any]: + """ + :param logger: The source logger + :param config: The user-provided configuration as specified by the source's spec. + This usually contains information required to check connection e.g. tokens, secrets and keys etc. + :return: A tuple of (boolean, error). If boolean is true, then the connection check is successful + and we can connect to the underlying data source using the provided configuration. + Otherwise, the input config cannot be used to connect to the underlying data source, + and the "error" object should describe what went wrong. + The error object will be cast to string to display the problem to the user. + """ + return self.connection_checker.check_connection(self, logger, config) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py new file mode 100644 index 0000000000000..feae3fa4d51a6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/declarative_stream.py @@ -0,0 +1,143 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.declarative.types import Config, StreamSlice +from airbyte_cdk.sources.streams.core import Stream +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class DeclarativeStream(Stream, JsonSchemaMixin): + """ + DeclarativeStream is a Stream that delegates most of its logic to its schema_load and retriever + + Attributes: + stream_name (str): stream name + stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): the primary key of the stream + schema_loader (SchemaLoader): The schema loader + retriever (Retriever): The retriever + config (Config): The user-provided configuration as specified by the source's spec + stream_cursor_field (Optional[List[str]]): The cursor field + transformations (List[RecordTransformation]): A list of transformations to be applied to each output record in the + stream. Transformations are applied in the order in which they are defined. 
+ checkpoint_interval (Optional[int]): How often the stream will checkpoint state (i.e: emit a STATE message) + """ + + schema_loader: SchemaLoader + retriever: Retriever + config: Config + options: InitVar[Mapping[str, Any]] + name: str + _name: str = field(init=False, repr=False) + primary_key: Optional[Union[str, List[str], List[List[str]]]] + _primary_key: str = field(init=False, repr=False) + stream_cursor_field: Optional[List[str]] = None + transformations: List[RecordTransformation] = None + checkpoint_interval: Optional[int] = None + + def __post_init__(self, options: Mapping[str, Any]): + self.stream_cursor_field = self.stream_cursor_field or [] + self.transformations = self.transformations or [] + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return self._primary_key + + @primary_key.setter + def primary_key(self, value: str) -> None: + if not isinstance(value, property): + self._primary_key = value + + @property + def name(self) -> str: + """ + :return: Stream name. By default this is the implementing class name, but it can be overridden as needed. + """ + return self._name + + @name.setter + def name(self, value: str) -> None: + if not isinstance(value, property): + self._name = value + + @property + def state_checkpoint_interval(self) -> Optional[int]: + """ + Decides how often to checkpoint state (i.e: emit a STATE message). E.g: if this returns a value of 100, then state is persisted after reading + 100 records, then 200, 300, etc.. A good default value is 1000 although your mileage may vary depending on the underlying data source. + + Checkpointing a stream avoids re-reading records in the case a sync is failed or cancelled. + + return None if state should not be checkpointed e.g: because records returned from the underlying data source are not returned in + ascending order with respect to the cursor field. This can happen if the source does not support reading records in ascending order of + created_at date (or whatever the cursor is). In those cases, state must only be saved once the full stream has been read. + """ + return self.checkpoint_interval + + @property + def state(self) -> MutableMapping[str, Any]: + return self.retriever.state + + @state.setter + def state(self, value: MutableMapping[str, Any]): + """State setter, accept state serialized by state getter.""" + self.retriever.state = value + + def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): + return self.state + + @property + def cursor_field(self) -> Union[str, List[str]]: + """ + Override to return the default cursor field used by this stream e.g: an API entity might always use created_at as the cursor field. + :return: The name of the field used as a cursor. If the cursor is nested, return an array consisting of the path to the cursor. 
+ """ + return self.stream_cursor_field + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + for record in self.retriever.read_records(sync_mode, cursor_field, stream_slice, stream_state): + yield self._apply_transformations(record, self.config, stream_slice) + + def _apply_transformations(self, record: Mapping[str, Any], config: Config, stream_slice: StreamSlice): + output_record = record + for transformation in self.transformations: + output_record = transformation.transform(record, config=config, stream_state=self.state, stream_slice=stream_slice) + + return output_record + + def get_json_schema(self) -> Mapping[str, Any]: + """ + :return: A dict of the JSON schema representing this stream. + + The default implementation of this method looks for a JSONSchema file with the same name as this stream's "name" property. + Override as needed. + """ + return self.schema_loader.get_json_schema() + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + """ + Override to define the slices for this stream. See the stream slicing section of the docs for more information. + + :param sync_mode: + :param cursor_field: + :param stream_state: + :return: + """ + # this is not passing the cursor field because it is known at init time + return self.retriever.stream_slices(sync_mode=sync_mode, stream_state=stream_state) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py new file mode 100644 index 0000000000000..64a933247bdb5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder + +__all__ = ["Decoder", "JsonDecoder"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py new file mode 100644 index 0000000000000..5ec36516f4fd2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/decoder.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, List, Mapping, Union + +import requests + + +@dataclass +class Decoder(ABC): + """ + Decoder strategy to transform a requests.Response into a Mapping[str, Any] + """ + + @abstractmethod + def decode(self, response: requests.Response) -> Union[Mapping[str, Any], List]: + """ + Decodes a requests.Response into a Mapping[str, Any] or an array + :param response: the response to decode + :return: Mapping or array describing the response + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py new file mode 100644 index 0000000000000..0cea903656845 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/decoders/json_decoder.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Union + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder + + +@dataclass +class JsonDecoder(Decoder): + """ + Decoder strategy that returns the json-encoded content of a response, if any. + """ + + options: InitVar[Mapping[str, Any]] + + def decode(self, response: requests.Response) -> Union[Mapping[str, Any], List]: + return response.json() or {} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py new file mode 100644 index 0000000000000..897f382ea0de2 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector + +__all__ = ["HttpSelector", "JelloExtractor", "RecordFilter", "RecordSelector"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py new file mode 100644 index 0000000000000..517f61c70b799 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/http_selector.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState + + +@dataclass +class HttpSelector(ABC): + """ + Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering + records based on a heuristic. + """ + + @abstractmethod + def select_records( + self, + response: requests.Response, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> List[Record]: + """ + Selects records from the response + :param response: The response to select the records from + :param stream_state: The stream state + :param stream_slice: The stream slice + :param next_page_token: The paginator token + :return: List of Records selected from the response + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/jello.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/jello.py new file mode 100644 index 0000000000000..f36613e2a56e7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/jello.py @@ -0,0 +1,43 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Union + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.types import Config, Record +from dataclasses_jsonschema import JsonSchemaMixin +from jello import lib as jello_lib + + +@dataclass +class JelloExtractor(JsonSchemaMixin): + """ + Record extractor that evaluates a Jello query to extract records from a decoded response. + + More information on Jello can be found at https://github.com/kellyjonbrazil/jello + + Attributes: + transform (Union[InterpolatedString, str]): The Jello query to evaluate on the decoded response + config (Config): The user-provided configuration as specified by the source's spec + decoder (Decoder): The decoder responsible to transfom the response in a Mapping + """ + + default_transform = "_" + transform: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + decoder: Decoder = JsonDecoder(options={}) + + def __post_init__(self, options: Mapping[str, Any]): + if isinstance(self.transform, str): + self.transform = InterpolatedString(string=self.transform, default=self.default_transform, options=options or {}) + + def extract_records(self, response: requests.Response) -> List[Record]: + response_body = self.decoder.decode(response) + script = self.transform.eval(self.config) + return jello_lib.pyquery(response_body, script) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py new file mode 100644 index 0000000000000..081dd75971300 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_filter.py @@ -0,0 +1,37 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional + +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class RecordFilter(JsonSchemaMixin): + """ + Filter applied on a list of Records + + config (Config): The user-provided configuration as specified by the source's spec + condition (str): The string representing the predicate to filter a record. 
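To illustrate the extractor above, a sketch that pulls records out of a wrapped JSON payload; the hand-built `requests.Response` and the `items` key are purely illustrative:

```python
import json

import requests

from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor

# Fake response whose body wraps the records under an "items" key (illustration only).
response = requests.Response()
response._content = json.dumps({"items": [{"id": 1}, {"id": 2}]}).encode("utf-8")

extractor = JelloExtractor(transform="_.items", config={}, options={})
print(extractor.extract_records(response))  # -> [{'id': 1}, {'id': 2}]
```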
Records will be removed if evaluated to False + """ + + options: InitVar[Mapping[str, Any]] + config: Config = field(default=dict) + condition: str = "" + + def __post_init__(self, options: Mapping[str, Any]): + self._filter_interpolator = InterpolatedBoolean(condition=self.condition, options=options) + + def filter_records( + self, + records: List[Record], + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> List[Record]: + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + return [record for record in records if self._filter_interpolator.eval(self.config, record=record, **kwargs)] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py new file mode 100644 index 0000000000000..dd738a69015d9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/extractors/record_selector.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class RecordSelector(HttpSelector, JsonSchemaMixin): + """ + Responsible for translating an HTTP response into a list of records by extracting records from the response and optionally filtering + records based on a heuristic. + + Attributes: + extractor (JelloExtractor): The record extractor responsible for extracting records from a response + record_filter (RecordFilter): The record filter responsible for filtering extracted records + """ + + extractor: JelloExtractor + options: InitVar[Mapping[str, Any]] + record_filter: RecordFilter = None + + def __post_init__(self, options: Mapping[str, Any]): + self._options = options + + def select_records( + self, + response: requests.Response, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> List[Record]: + all_records = self.extractor.extract_records(response) + # Some APIs don't wrap single records in a list + if not isinstance(all_records, list): + all_records = [all_records] + if self.record_filter: + return self.record_filter.filter_records( + all_records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + return all_records diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py new file mode 100644 index 0000000000000..1f1b53a1910ac --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + +__all__ = ["InterpolatedBoolean", "InterpolatedMapping", "InterpolatedString"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py new file mode 100644 index 0000000000000..f7979dd69e46f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Final, List, Mapping + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.declarative.types import Config +from dataclasses_jsonschema import JsonSchemaMixin + +FALSE_VALUES: Final[List[Any]] = ["False", "false", "{}", "[]", "()", "", "0", "0.0", "False", "false", {}, False, [], (), set()] + + +@dataclass +class InterpolatedBoolean(JsonSchemaMixin): + f""" + Wrapper around a string to be evaluated to a boolean value. + The string will be evaluated as False if it interpolates to a value in {FALSE_VALUES} + + Attributes: + condition (str): The string representing the condition to evaluate to a boolean + """ + condition: str + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._default = "False" + self._interpolation = JinjaInterpolation() + self._options = options + + def eval(self, config: Config, **additional_options): + """ + Interpolates the predicate condition string using the config and other optional arguments passed as parameter. + + :param config: The user-provided configuration as specified by the source's spec + :param additional_options: Optional parameters used for interpolation + :return: The interpolated string + """ + if isinstance(self.condition, bool): + return self.condition + else: + evaluated = self._interpolation.eval(self.condition, config, self._default, options=self._options, **additional_options) + if evaluated in FALSE_VALUES: + return False + # The presence of a value is generally regarded as truthy, so we treat it as such + return True diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py new file mode 100644 index 0000000000000..6c1f80886a529 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py @@ -0,0 +1,52 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.declarative.types import Config +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class InterpolatedMapping(JsonSchemaMixin): + """ + Wrapper around a Mapping[str, str] where both the keys and values are to be interpolated. 
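A quick sketch of how the boolean wrapper above evaluates against a config; the `include_deleted` key is an illustrative assumption:

```python
from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean

flag = InterpolatedBoolean(condition="{{ config['include_deleted'] }}", options={})

print(flag.eval(config={"include_deleted": False}))  # -> False ("False" is in FALSE_VALUES)
print(flag.eval(config={"include_deleted": "yes"}))  # -> True (any other non-empty value)
```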
+ + Attributes: + mapping (Mapping[str, str]): to be evaluated + """ + + mapping: Mapping[str, str] + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Optional[Mapping[str, Any]]): + self._interpolation = JinjaInterpolation() + self._options = options + + def eval(self, config: Config, **additional_options): + """ + Wrapper around a Mapping[str, str] that allows for both keys and values to be interpolated. + + :param config: The user-provided configuration as specified by the source's spec + :param additional_options: Optional parameters used for interpolation + :return: The interpolated string + """ + interpolated_values = { + self._interpolation.eval(name, config, options=self._options, **additional_options): self._eval( + value, config, **additional_options + ) + for name, value in self.mapping.items() + } + return interpolated_values + + def _eval(self, value, config, **kwargs): + # The values in self._mapping can be of Any type + # We only want to interpolate them if they are strings + if type(value) == str: + return self._interpolation.eval(value, config, options=self._options, **kwargs) + else: + return value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py new file mode 100644 index 0000000000000..145be0d949d0e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolated_string.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.declarative.types import Config +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class InterpolatedString(JsonSchemaMixin): + """ + Wrapper around a raw string to be interpolated with the Jinja2 templating engine + + Attributes: + string (str): The string to evalute + default (Optional[str]): The default value to return if the evaluation returns an empty string + options (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + string: str + options: InitVar[Mapping[str, Any]] + default: Optional[str] = None + + def __post_init__(self, options: Mapping[str, Any]): + self.default = self.default or self.string + self._interpolation = JinjaInterpolation() + self._options = options + + def eval(self, config: Config, **kwargs): + """ + Interpolates the input string using the config and other optional arguments passed as parameter. + + :param config: The user-provided configuration as specified by the source's spec + :param kwargs: Optional parameters used for interpolation + :return: The interpolated string + """ + return self._interpolation.eval(self.string, config, self.default, options=self._options, **kwargs) + + def __eq__(self, other): + if not isinstance(other, InterpolatedString): + return False + return self.string == other.string and self.default == other.default + + @classmethod + def create( + cls, + string_or_interpolated: Union["InterpolatedString", str], + *, + options: Mapping[str, Any], + ): + """ + Helper function to obtain an InterpolatedString from either a raw string or an InterpolatedString. + + :param string_or_interpolated: either a raw string or an InterpolatedString. 
+ :param options: options parameters propagated from parent component + :return: InterpolatedString representing the input string. + """ + if isinstance(string_or_interpolated, str): + return InterpolatedString(string=string_or_interpolated, options=options) + else: + return string_or_interpolated diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py new file mode 100644 index 0000000000000..6fb8fabb25888 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/interpolation.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from typing import Optional + +from airbyte_cdk.sources.declarative.types import Config + + +class Interpolation(ABC): + """ + Strategy for evaluating the interpolated value of a string at runtime using Jinja. + """ + + @abstractmethod + def eval(self, input_str: str, config: Config, default: Optional[str] = None, **additional_options): + """ + Interpolates the input string using the config, and additional options passed as parameter. + + :param input_str: The string to interpolate + :param config: The user-provided configuration as specified by the source's spec + :param default: Default value to return if the evaluation returns an empty string + :param additional_options: Optional parameters used for interpolation + :return: The interpolated string + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py new file mode 100644 index 0000000000000..883118fb5fdfc --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/jinja.py @@ -0,0 +1,64 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import ast +from typing import Optional + +from airbyte_cdk.sources.declarative.interpolation.interpolation import Interpolation +from airbyte_cdk.sources.declarative.interpolation.macros import macros +from airbyte_cdk.sources.declarative.types import Config +from jinja2 import Environment +from jinja2.exceptions import UndefinedError + + +class JinjaInterpolation(Interpolation): + """ + Interpolation strategy using the Jinja2 template engine. + + If the input string is a raw string, the interpolated string will be the same. + `eval("hello world") -> "hello world"` + + The engine will evaluate the content passed within {{}}, interpolating the keys from the config and context-specific arguments. + `eval("hello {{ name }}", name="airbyte") -> "hello airbyte")` + `eval("hello {{ config.name }}", config={"name": "airbyte"}) -> "hello airbyte")` + + In additional to passing additional values through the kwargs argument, macros can be called from within the string interpolation. 
+ For example, + "{{ max(2, 3) }}" will return 3 + + Additional information on jinja templating can be found at https://jinja.palletsprojects.com/en/3.1.x/templates/# + """ + + def __init__(self): + self._environment = Environment() + self._environment.globals.update(**macros) + + def eval(self, input_str: str, config: Config, default: Optional[str] = None, **additional_options): + context = {"config": config, **additional_options} + try: + if isinstance(input_str, str): + result = self._eval(input_str, context) + if result: + return self._literal_eval(result) + else: + # If input is not a string, return it as is + raise Exception(f"Expected a string. got {input_str}") + except UndefinedError: + pass + # If result is empty or resulted in an undefined error, evaluate and return the default string + return self._literal_eval(self._eval(default, context)) + + def _literal_eval(self, result): + try: + return ast.literal_eval(result) + except (ValueError, SyntaxError): + return result + + def _eval(self, s: str, context): + try: + return self._environment.from_string(s).render(context) + except TypeError: + # The string is a static value, not a jinja template + # It can be returned as is + return s diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py new file mode 100644 index 0000000000000..018384886f1e9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/interpolation/macros.py @@ -0,0 +1,100 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import builtins +import datetime +import numbers +from typing import Union + +from dateutil import parser + +""" +This file contains macros that can be evaluated by a `JinjaInterpolation` object +""" + + +def now_local() -> datetime.datetime: + """ + Current local date and time. + + Usage: + `"{{ now_local() }}" + """ + return datetime.datetime.now() + + +def now_utc(): + """ + Current local date and time in UTC timezone + + Usage: + `"{{ now_utc() }}"` + """ + return datetime.datetime.now(datetime.timezone.utc) + + +def today_utc(): + """ + Current date in UTC timezone + + Usage: + `"{{ today_utc() }}"` + """ + return datetime.datetime.now(datetime.timezone.utc).date() + + +def timestamp(dt: Union[numbers.Number, str]): + """ + Converts a number or a string to a timestamp + + If dt is a number, then convert to an int + If dt is a string, then parse it using dateutil.parser + + Usage: + `"{{ timestamp(1658505815.223235) }}" + + :param dt: datetime to convert to timestamp + :return: unix timestamp + """ + if isinstance(dt, numbers.Number): + return int(dt) + else: + return int(parser.parse(dt).replace(tzinfo=datetime.timezone.utc).timestamp()) + + +def max(*args): + """ + Returns biggest object of an iterable, or two or more arguments. + + max(iterable, *[, default=obj, key=func]) -> value + max(arg1, arg2, *args, *[, key=func]) -> value + + Usage: + `"{{ max(2,3) }}" + + With a single iterable argument, return its biggest item. The + default keyword-only argument specifies an object to return if + the provided iterable is empty. + With two or more arguments, return the largest argument. 
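Bringing the interpolation engine and its macros together, a brief evaluation sketch (expected outputs shown as comments):

```python
from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation

interpolation = JinjaInterpolation()

print(interpolation.eval("hello {{ name }}", config={}, name="airbyte"))           # hello airbyte
print(interpolation.eval("hello {{ config.name }}", config={"name": "airbyte"}))   # hello airbyte
print(interpolation.eval("{{ max(2, 3) }}", config={}))                            # 3
```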
+ :param args: args to compare + :return: largest argument + """ + return builtins.max(*args) + + +def day_delta(num_days: int) -> str: + """ + Returns datetime of now() + num_days + + Usage: + `"{{ day_delta(25) }}"` + + :param num_days: number of days to add to current date time + :return: datetime formatted as RFC3339 + """ + return (datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=num_days)).strftime("%Y-%m-%dT%H:%M:%S.%f%z") + + +_macros_list = [now_local, now_utc, today_utc, timestamp, max, day_delta] +macros = {f.__name__: f for f in _macros_list} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py new file mode 100644 index 0000000000000..ad0c268e1ac15 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/class_types_registry.py @@ -0,0 +1,64 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from typing import Mapping, Type + +from airbyte_cdk.sources.declarative.auth.token import ApiKeyAuthenticator, BasicHttpAuthenticator, BearerAuthenticator +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import LimitPaginator +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.declarative.schema.json_schema import JsonSchema +from airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer import CartesianProductStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer import DatetimeStreamSlicer +from 
airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer import ListStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.substream_slicer import SubstreamSlicer +from airbyte_cdk.sources.declarative.transformations import RemoveFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddFields + +""" +CLASS_TYPES_REGISTRY contains a mapping of developer-friendly string -> class to abstract the specific class referred to +""" +CLASS_TYPES_REGISTRY: Mapping[str, Type] = { + "AddFields": AddFields, + "ApiKeyAuthenticator": ApiKeyAuthenticator, + "BasicHttpAuthenticator": BasicHttpAuthenticator, + "BearerAuthenticator": BearerAuthenticator, + "CartesianProductStreamSlicer": CartesianProductStreamSlicer, + "CompositeErrorHandler": CompositeErrorHandler, + "ConstantBackoffStrategy": ConstantBackoffStrategy, + "CursorPagination": CursorPaginationStrategy, + "DatetimeStreamSlicer": DatetimeStreamSlicer, + "DeclarativeStream": DeclarativeStream, + "DefaultErrorHandler": DefaultErrorHandler, + "ExponentialBackoffStrategy": ExponentialBackoffStrategy, + "HttpRequester": HttpRequester, + "InterpolatedBoolean": InterpolatedBoolean, + "InterpolatedString": InterpolatedString, + "JelloExtractor": JelloExtractor, + "JsonSchema": JsonSchema, + "LimitPaginator": LimitPaginator, + "ListStreamSlicer": ListStreamSlicer, + "MinMaxDatetime": MinMaxDatetime, + "NoPagination": NoPagination, + "OffsetIncrement": OffsetIncrement, + "RecordSelector": RecordSelector, + "RemoveFields": RemoveFields, + "SimpleRetriever": SimpleRetriever, + "SubstreamSlicer": SubstreamSlicer, +} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/config_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/config_parser.py new file mode 100644 index 0000000000000..06c61f1215448 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/config_parser.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod + +from airbyte_cdk.sources.declarative.types import ConnectionDefinition + + +class ConnectionDefinitionParser(ABC): + """ + Parses a string to a ConnectionDefinition + """ + + @abstractmethod + def parse(self, config_str: str) -> ConnectionDefinition: + """Parses the config_str to a ConnectionDefinition""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py new file mode 100644 index 0000000000000..f09c00d954e85 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from typing import Mapping, Type + +from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import RequestOption +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.requester import Requester +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.declarative.schema.json_schema import JsonSchema +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.declarative.stream_slicers.single_slice import SingleSlice +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.substream_slicer import ParentStreamConfig +from airbyte_cdk.sources.streams.core import Stream + +""" +DEFAULT_IMPLEMENTATIONS_REGISTRY contains a mapping of interface -> subclass +enabling the factory to instantiate a reasonable default class when no type or classname is specified +""" +DEFAULT_IMPLEMENTATIONS_REGISTRY: Mapping[Type, Type] = { + ConnectionChecker: CheckStream, + Decoder: JsonDecoder, + ErrorHandler: DefaultErrorHandler, + HttpResponseFilter: HttpResponseFilter, + HttpSelector: RecordSelector, + InterpolatedBoolean: InterpolatedBoolean, + InterpolatedRequestOptionsProvider: InterpolatedRequestOptionsProvider, + InterpolatedString: InterpolatedString, + MinMaxDatetime: MinMaxDatetime, + Paginator: NoPagination, + RequestOption: RequestOption, + RequestOptionsProvider: InterpolatedRequestOptionsProvider, + Requester: HttpRequester, + Retriever: SimpleRetriever, + ParentStreamConfig: ParentStreamConfig, + SchemaLoader: JsonSchema, + Stream: DeclarativeStream, + StreamSlicer: SingleSlice, +} diff --git 
a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py
new file mode 100644
index 0000000000000..6303b05ca1f82
--- /dev/null
+++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/factory.py
@@ -0,0 +1,240 @@
+#
+# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
+#
+
+from __future__ import annotations
+
+import copy
+import enum
+import importlib
+from typing import Any, List, Literal, Mapping, Type, Union, get_args, get_origin, get_type_hints
+
+from airbyte_cdk.sources.declarative.create_partial import OPTIONS_STR, create
+from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation
+from airbyte_cdk.sources.declarative.parsers.class_types_registry import CLASS_TYPES_REGISTRY
+from airbyte_cdk.sources.declarative.parsers.default_implementation_registry import DEFAULT_IMPLEMENTATIONS_REGISTRY
+from airbyte_cdk.sources.declarative.types import Config
+
+ComponentDefinition: Union[Literal, Mapping, List]
+
+
+class DeclarativeComponentFactory:
+    """
+    Instantiates objects from a Mapping[str, Any] defining the object to create.
+
+    If the component is a literal, then it is returned as is:
+    ```
+    3
+    ```
+    will result in
+    ```
+    3
+    ```
+
+    If the component is a mapping with a "class_name" field,
+    an object of type "class_name" will be instantiated by passing the mapping's other fields to the constructor
+    ```
+    {
+      "class_name": "fully_qualified.class_name",
+      "a_parameter": 3,
+      "another_parameter": "hello"
+    }
+    ```
+    will result in
+    ```
+    fully_qualified.class_name(a_parameter=3, another_parameter="hello")
+    ```
+
+    If the component definition is a mapping with a "type" field,
+    the factory will look up the `CLASS_TYPES_REGISTRY` and replace the "type" field by "class_name" -> CLASS_TYPES_REGISTRY[type]
+    and instantiate the object from the resulting mapping
+
+    If the component definition is a mapping with neither a "class_name" nor a "type" field,
+    the factory will do a best-effort attempt at inferring the component type by looking up the parent object's constructor type hints.
+    If the type hint is an interface present in `DEFAULT_IMPLEMENTATIONS_REGISTRY`,
+    then the factory will create an object of its default implementation.
+
+    If the component definition is a list, then the factory will iterate over the elements of the list,
+    instantiate its subcomponents, and return a list of instantiated objects.
+
+    If the component has subcomponents, the factory will create the subcomponents before instantiating the top level object
+    ```
+    {
+      "type": "TopLevel",
+      "param":
+        {
+          "type": "ParamType",
+          "k": "v"
+        }
+    }
+    ```
+    will result in
+    ```
+    TopLevel(param=ParamType(k="v"))
+    ```
+
+    Parameters can be passed down from a parent component to its subcomponents using the $options key.
+    This can be used to avoid repetitions.
+    ```
+    outer:
+      $options:
+        MyKey: MyValue
+      inner:
+        k2: v2
+    ```
+    In the example above, if both outer and inner are types with a "MyKey" field, both of them will evaluate to "MyValue".
+
+    The value can also be used for string interpolation:
+    ```
+    outer:
+      $options:
+        MyKey: MyValue
+      inner:
+        k2: "MyKey is {{ options.MyKey }}"
+    ```
+    In this example, outer.inner.k2 will evaluate to "MyKey is MyValue"
+
+    """
+
+    def __init__(self):
+        self._interpolator = JinjaInterpolation()
+
+    def create_component(self, component_definition: ComponentDefinition, config: Config):
+        """
+        Create a component defined by `component_definition`.
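As a hedged sketch of the "class_name" path described above (the backoff strategy below is only used as a convenient concrete class from this package; any fully qualified name resolves the same way):

```
import importlib

# A definition that names the class explicitly instead of using a registry "type".
definition = {
    "class_name": "airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy.ConstantBackoffStrategy",
    "backoff_time_in_seconds": 5,
}

# Mirrors _get_class_from_fully_qualified_class_name below: import the module, read the attribute.
module_path, _, class_name = definition["class_name"].rpartition(".")
component_class = getattr(importlib.import_module(module_path), class_name)
assert component_class.__name__ == "ConstantBackoffStrategy"

# Note that create_component returns a partial: callers invoke it with a trailing "()"
# (see the create_component(definition, config)() calls further down in this module).
```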
+ + This method will also traverse and instantiate its subcomponents if needed. + :param component_definition: The definition of the object to create. + :param config: Connector's config + :return: The object to create + """ + kwargs = copy.deepcopy(component_definition) + if "class_name" in kwargs: + class_name = kwargs.pop("class_name") + elif "type" in kwargs: + class_name = CLASS_TYPES_REGISTRY[kwargs.pop("type")] + else: + raise ValueError(f"Failed to create component because it has no class_name or type. Definition: {component_definition}") + return self.build(class_name, config, **kwargs) + + def build(self, class_or_class_name: Union[str, Type], config, **kwargs): + if isinstance(class_or_class_name, str): + class_ = self._get_class_from_fully_qualified_class_name(class_or_class_name) + else: + class_ = class_or_class_name + + # create components in options before propagating them + if OPTIONS_STR in kwargs: + kwargs[OPTIONS_STR] = {k: self._create_subcomponent(k, v, kwargs, config, class_) for k, v in kwargs[OPTIONS_STR].items()} + + updated_kwargs = {k: self._create_subcomponent(k, v, kwargs, config, class_) for k, v in kwargs.items()} + return create(class_, config=config, **updated_kwargs) + + @staticmethod + def _get_class_from_fully_qualified_class_name(class_name: str): + split = class_name.split(".") + module = ".".join(split[:-1]) + class_name = split[-1] + return getattr(importlib.import_module(module), class_name) + + @staticmethod + def _merge_dicts(d1, d2): + return {**d1, **d2} + + def _create_subcomponent(self, key, definition, kwargs, config, parent_class): + """ + There are 5 ways to define a component. + 1. dict with "class_name" field -> create an object of type "class_name" + 2. dict with "type" field -> lookup the `CLASS_TYPES_REGISTRY` to find the type of object and create an object of that type + 3. a dict with a type that can be inferred. If the parent class's constructor has type hints, we can infer the type of the object to create by looking up the `DEFAULT_IMPLEMENTATIONS_REGISTRY` map + 4. list: loop over the list and create objects for its items + 5. 
anything else -> return as is + """ + if self.is_object_definition_with_class_name(definition): + # propagate kwargs to inner objects + definition[OPTIONS_STR] = self._merge_dicts(kwargs.get(OPTIONS_STR, dict()), definition.get(OPTIONS_STR, dict())) + return self.create_component(definition, config)() + elif self.is_object_definition_with_type(definition): + # If type is set instead of class_name, get the class_name from the CLASS_TYPES_REGISTRY + definition[OPTIONS_STR] = self._merge_dicts(kwargs.get(OPTIONS_STR, dict()), definition.get(OPTIONS_STR, dict())) + object_type = definition.pop("type") + class_name = CLASS_TYPES_REGISTRY[object_type] + definition["class_name"] = class_name + return self.create_component(definition, config)() + elif isinstance(definition, dict): + # Try to infer object type + expected_type = self.get_default_type(key, parent_class) + # if there is an expected type, and it's not a builtin type, then instantiate it + # We don't have to instantiate builtin types (eg string and dict) because definition is already going to be of that type + if expected_type and not self._is_builtin_type(expected_type): + definition["class_name"] = expected_type + definition[OPTIONS_STR] = self._merge_dicts(kwargs.get(OPTIONS_STR, dict()), definition.get(OPTIONS_STR, dict())) + return self.create_component(definition, config)() + else: + return definition + elif isinstance(definition, list): + return [ + self._create_subcomponent( + key, sub, self._merge_dicts(kwargs.get(OPTIONS_STR, dict()), self._get_subcomponent_options(sub)), config, parent_class + ) + for sub in definition + ] + else: + expected_type = self.get_default_type(key, parent_class) + if expected_type and not isinstance(definition, expected_type): + # call __init__(definition) if definition is not a dict and is not of the expected type + # for instance, to turn a string into an InterpolatedString + options = kwargs.get(OPTIONS_STR, {}) + try: + # enums can't accept options + if issubclass(expected_type, enum.Enum): + return expected_type(definition) + else: + return expected_type(definition, options=options) + except Exception as e: + raise Exception(f"failed to instantiate type {expected_type}. 
{e}") + else: + return definition + + @staticmethod + def is_object_definition_with_class_name(definition): + return isinstance(definition, dict) and "class_name" in definition + + @staticmethod + def is_object_definition_with_type(definition): + return isinstance(definition, dict) and "type" in definition + + @staticmethod + def get_default_type(parameter_name, parent_class): + type_hints = get_type_hints(parent_class.__init__) + interface = type_hints.get(parameter_name) + while True: + origin = get_origin(interface) + if origin: + # Unnest types until we reach the raw type + # List[T] -> T + # Optional[List[T]] -> T + args = get_args(interface) + interface = args[0] + else: + break + + expected_type = DEFAULT_IMPLEMENTATIONS_REGISTRY.get(interface) + + if expected_type: + return expected_type + else: + return interface + + @staticmethod + def _get_subcomponent_options(sub: Any): + if isinstance(sub, dict): + return sub.get(OPTIONS_STR, {}) + else: + return {} + + @staticmethod + def _is_builtin_type(cls) -> bool: + if not cls: + return False + return cls.__module__ == "builtins" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/undefined_reference_exception.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/undefined_reference_exception.py new file mode 100644 index 0000000000000..f9721edbf7eec --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/undefined_reference_exception.py @@ -0,0 +1,12 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +class UndefinedReferenceException(Exception): + """ + Raised when refering to an undefined reference. + """ + + def __init__(self, path, reference): + super().__init__(f"Undefined reference {reference} from {path}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/yaml_parser.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/yaml_parser.py new file mode 100644 index 0000000000000..b9885c6e1043c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/parsers/yaml_parser.py @@ -0,0 +1,202 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from copy import deepcopy +from typing import Any, Mapping, Tuple, Union + +import yaml +from airbyte_cdk.sources.declarative.parsers.config_parser import ConnectionDefinitionParser +from airbyte_cdk.sources.declarative.parsers.undefined_reference_exception import UndefinedReferenceException +from airbyte_cdk.sources.declarative.types import ConnectionDefinition + + +class YamlParser(ConnectionDefinitionParser): + """ + Parses a Yaml string to a ConnectionDefinition + + In addition to standard Yaml parsing, the input_string can contain refererences to values previously defined. + This parser will dereference these values to produce a complete ConnectionDefinition. + + References can be defined using a *ref() string. 
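As a hedged, usage-level sketch of the dereferencing described here (the key names are made up; only the *ref() syntax matters):

```
from airbyte_cdk.sources.declarative.parsers.yaml_parser import YamlParser

manifest = """
base_url: "https://api.example.com"
requester_url: "*ref(base_url)"
"""

# parse() dereferences "*ref(base_url)" to the previously defined value.
parsed = YamlParser().parse(manifest)
assert parsed["requester_url"] == "https://api.example.com"
```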
+    ```
+    key: 1234
+    reference: "*ref(key)"
+    ```
+    will produce the following definition:
+    ```
+    key: 1234
+    reference: 1234
+    ```
+    This also works with objects:
+    ```
+    key_value_pairs:
+      k1: v1
+      k2: v2
+    same_key_value_pairs: "*ref(key_value_pairs)"
+    ```
+    will produce the following definition:
+    ```
+    key_value_pairs:
+      k1: v1
+      k2: v2
+    same_key_value_pairs:
+      k1: v1
+      k2: v2
+    ```
+
+    The $ref keyword can be used to refer to an object and enhance it with additional key-value pairs
+    ```
+    key_value_pairs:
+      k1: v1
+      k2: v2
+    same_key_value_pairs:
+      $ref: "*ref(key_value_pairs)"
+      k3: v3
+    ```
+    will produce the following definition:
+    ```
+    key_value_pairs:
+      k1: v1
+      k2: v2
+    same_key_value_pairs:
+      k1: v1
+      k2: v2
+      k3: v3
+    ```
+
+    References can also point to nested values.
+    Nested references are ambiguous because one could define a key containing a `.`
+    In this example, we want to refer to the limit key in the dict object:
+    ```
+    dict:
+      limit: 50
+    limit_ref: "*ref(dict.limit)"
+    ```
+    will produce the following definition:
+    ```
+    dict:
+      limit: 50
+    limit_ref: 50
+    ```
+
+    whereas here we want to access the `nested.path` value.
+    ```
+    nested:
+      path: "first one"
+    nested.path: "uh oh"
+    value: "*ref(nested.path)"
+    ```
+    will produce the following definition:
+    ```
+    nested:
+      path: "first one"
+    nested.path: "uh oh"
+    value: "uh oh"
+    ```
+
+    To resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward
+    until we find a key with the given path, or until there is nothing to traverse.
+    """
+
+    ref_tag = "$ref"
+
+    def parse(self, connection_definition_str: str) -> ConnectionDefinition:
+        """
+        Parses a yaml file and dereferences strings in the form "*ref({reference})"
+        to {reference}
+        :param connection_definition_str: yaml string to parse
+        :return: The ConnectionDefinition parsed from connection_definition_str
+        """
+        input_mapping = yaml.safe_load(connection_definition_str)
+        evaluated_definition = {}
+        return self._preprocess_dict(input_mapping, evaluated_definition, "")
+
+    def _preprocess_dict(self, input_mapping: Mapping[str, Any], evaluated_mapping: Mapping[str, Any], path: Union[str, Tuple[str]]):
+
+        """
+        :param input_mapping: mapping produced by parsing yaml
+        :param evaluated_mapping: mapping produced by dereferencing the content of input_mapping
+        :param path: current path in configuration traversal
+        :return:
+        """
+        d = {}
+        if self.ref_tag in input_mapping:
+            partial_ref_string = input_mapping[self.ref_tag]
+            d = deepcopy(self._preprocess(partial_ref_string, evaluated_mapping, path))
+
+        for key, value in input_mapping.items():
+            if key == self.ref_tag:
+                continue
+            full_path = self._resolve_value(key, path)
+            if full_path in evaluated_mapping:
+                raise Exception(f"Databag already contains key={key} with path {full_path}")
+            processed_value = self._preprocess(value, evaluated_mapping, full_path)
+            evaluated_mapping[full_path] = processed_value
+            d[key] = processed_value
+
+        return d
+
+    def _get_ref_key(self, s: str) -> str:
+        ref_start = s.find("*ref(")
+        if ref_start == -1:
+            return None
+        return s[ref_start + 5 : s.find(")")]
+
+    def _resolve_value(self, value: str, path):
+        if path:
+            return *path, value
+        else:
+            return (value,)
+
+    def _preprocess(self, value, evaluated_config: Mapping[str, Any], path):
+        if isinstance(value, str):
+            ref_key = self._get_ref_key(value)
+            if ref_key is None:
+                return value
+            else:
+                """
+                references are ambiguous because one could define a key containing a 
`.` + in this example, we want to refer to the limit key in the dict object: + dict: + limit: 50 + limit_ref: "*ref(dict.limit)" + + whereas here we want to access the `nested.path` value. + nested: + path: "first one" + nested.path: "uh oh" + value: "ref(nested.path) + + to resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward + until we find a key with the given path, or until there is nothing to traverse. + """ + key = (ref_key,) + while key[-1]: + if key in evaluated_config: + return evaluated_config[key] + else: + split = key[-1].split(".") + key = *key[:-1], split[0], ".".join(split[1:]) + raise UndefinedReferenceException(path, ref_key) + elif isinstance(value, dict): + return self._preprocess_dict(value, evaluated_config, path) + elif type(value) == list: + evaluated_list = [ + # pass in elem's path instead of the list's path + self._preprocess(v, evaluated_config, self._get_path_for_list_item(path, index)) + for index, v in enumerate(value) + ] + # Add the list's element to the evaluated config so they can be referenced + for index, elem in enumerate(evaluated_list): + evaluated_config[self._get_path_for_list_item(path, index)] = elem + return evaluated_list + else: + return value + + def _get_path_for_list_item(self, path, index): + # An elem's path is {path_to_list}[{index}] + if len(path) > 1: + return path[:-1], f"{path[-1]}[{index}]" + else: + return (f"{path[-1]}[{index}]",) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/read_exception.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/read_exception.py new file mode 100644 index 0000000000000..160cdcb43f0cb --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/read_exception.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +class ReadException(Exception): + """ + Raise when there is an error reading data from an API Source + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py new file mode 100644 index 0000000000000..ca8377e6fc979 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption +from airbyte_cdk.sources.declarative.requesters.requester import Requester + +__all__ = ["HttpRequester", "RequestOption", "Requester"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py new file mode 100644 index 0000000000000..f2602eea94b53 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py @@ -0,0 +1,11 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter + +__all__ = ["BackoffStrategy", "CompositeErrorHandler", "DefaultErrorHandler", "ErrorHandler", "HttpResponseFilter"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py new file mode 100644 index 0000000000000..15472c2bd76a9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy import ( + WaitTimeFromHeaderBackoffStrategy, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy import ( + WaitUntilTimeFromHeaderBackoffStrategy, +) + +__all__ = [ + "ConstantBackoffStrategy", + "ExponentialBackoffStrategy", + "WaitTimeFromHeaderBackoffStrategy", + "WaitUntilTimeFromHeaderBackoffStrategy", +] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py new file mode 100644 index 0000000000000..3a7df2dc7b365 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class ConstantBackoffStrategy(BackoffStrategy, JsonSchemaMixin): + """ + Backoff strategy with a constant backoff interval + + Attributes: + backoff_time_in_seconds (float): time to backoff before retrying a retryable request. 
+ """ + + backoff_time_in_seconds: float + + def backoff(self, response: requests.Response, attempt_count: int) -> Optional[float]: + return self.backoff_time_in_seconds diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py new file mode 100644 index 0000000000000..75a52ffca3756 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import dataclass +from typing import Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class ExponentialBackoffStrategy(BackoffStrategy, JsonSchemaMixin): + """ + Backoff strategy with an exponential backoff interval + + Attributes: + factor (float): multiplicative factor + """ + + factor: float = 5 + + def backoff(self, response: requests.Response, attempt_count: int) -> Optional[float]: + return self.factor * 2**attempt_count diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py new file mode 100644 index 0000000000000..f3a17b7388509 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import numbers +from re import Pattern +from typing import Optional + +import requests + + +def get_numeric_value_from_header(response: requests.Response, header: str, regex: Optional[Pattern]) -> Optional[float]: + """ + Extract a header value from the response as a float + :param response: response the extract header value from + :param header: Header to extract + :param regex: optional regex to apply on the header to obtain the value + :return: header value as float if it's a number. None otherwise + """ + header_value = response.headers.get(header, None) + if not header_value: + return None + if isinstance(header_value, str): + if regex: + match = regex.match(header_value) + if match: + header_value = match.group() + return _as_float(header_value) + elif isinstance(header_value, numbers.Number): + return float(header_value) + else: + return None + + +def _as_float(s: str) -> Optional[float]: + try: + return float(s) + except ValueError: + return None diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py new file mode 100644 index 0000000000000..3ff279c78eb5c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
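A hedged sketch of the two building blocks defined above: the exponential wait-time arithmetic and the header helper (the Retry-After value is made up):

```
import requests

from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import (
    ExponentialBackoffStrategy,
)
from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header

# Exponential backoff waits factor * 2 ** attempt_count seconds.
strategy = ExponentialBackoffStrategy(factor=5)
response = requests.Response()
assert strategy.backoff(response, attempt_count=2) == 20

# The header helper extracts a numeric wait time from a response header.
response.headers["Retry-After"] = "30"
assert get_numeric_value_from_header(response, "Retry-After", regex=None) == 30.0
```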
+# + +import re +from dataclasses import dataclass +from typing import Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class WaitTimeFromHeaderBackoffStrategy(BackoffStrategy, JsonSchemaMixin): + """ + Extract wait time from http header + + Attributes: + header (str): header to read wait time from + regex (Optional[str]): optional regex to apply on the header to extract its value + """ + + header: str + regex: Optional[str] = None + + def __post_init__(self): + self.regex = re.compile(self.regex) if self.regex else None + + def backoff(self, response: requests.Response, attempt_count: int) -> Optional[float]: + header_value = get_numeric_value_from_header(response, self.header, self.regex) + return header_value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py new file mode 100644 index 0000000000000..0e56741035ba6 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py @@ -0,0 +1,49 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import numbers +import re +import time +from dataclasses import dataclass +from typing import Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class WaitUntilTimeFromHeaderBackoffStrategy(BackoffStrategy, JsonSchemaMixin): + """ + Extract time at which we can retry the request from response header + and wait for the difference between now and that time + + Attributes: + header (str): header to read wait time from + min_wait (Optional[float]): minimum time to wait for safety + regex (Optional[str]): optional regex to apply on the header to extract its value + """ + + header: str + min_wait: Optional[float] = None + regex: Optional[str] = None + + def __post_init__(self): + self.regex = re.compile(self.regex) if self.regex else None + + def backoff(self, response: requests.Response, attempt_count: int) -> Optional[float]: + now = time.time() + wait_until = get_numeric_value_from_header(response, self.header, self.regex) + if wait_until is None or not wait_until: + return self.min_wait + if (isinstance(wait_until, str) and wait_until.isnumeric()) or isinstance(wait_until, numbers.Number): + wait_time = float(wait_until) - now + else: + return self.min_wait + if self.min_wait: + return max(wait_time, self.min_wait) + elif wait_time < 0: + return None + return wait_time diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py new file mode 100644 index 0000000000000..00c1b6dff23b6 --- /dev/null +++ 
b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Optional + +import requests + + +@dataclass +class BackoffStrategy: + """ + Backoff strategy defining how long to wait before retrying a request that resulted in an error. + """ + + @abstractmethod + def backoff(self, response: requests.Response, attempt_count: int) -> Optional[float]: + """ + Return time to wait before retrying the request. + :param response: response received for the request to retry + :param attempt_count: number of attempts to submit the request + :return: time to wait in seconds + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py new file mode 100644 index 0000000000000..0c2cfe5da878f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py @@ -0,0 +1,60 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Union + +import airbyte_cdk.sources.declarative.requesters.error_handlers.response_status as response_status +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class CompositeErrorHandler(ErrorHandler, JsonSchemaMixin): + """ + Error handler that sequentially iterates over a list of `ErrorHandler`s + + Sample config chaining 2 different retriers: + error_handler: + type: "CompositeErrorHandler" + error_handlers: + - response_filters: + - predicate: "{{ 'codase' in response }}" + action: RETRY + backoff_strategies: + - type: "ConstantBackoffStrategy" + backoff_time_in_seconds: 5 + - response_filters: + - http_codes: [ 403 ] + action: RETRY + backoff_strategies: + - type: "ConstantBackoffStrategy" + backoff_time_in_seconds: 10 + Attributes: + error_handlers (List[ErrorHandler]): list of error handlers + """ + + error_handlers: List[ErrorHandler] + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + if not self.error_handlers: + raise ValueError("CompositeErrorHandler expects at least 1 underlying error handler") + + @property + def max_retries(self) -> Union[int, None]: + return self.error_handlers[0].max_retries + + def should_retry(self, response: requests.Response) -> ResponseStatus: + should_retry = None + for retrier in self.error_handlers: + should_retry = retrier.should_retry(response) + if should_retry.action == ResponseAction.SUCCESS: + return response_status.SUCCESS + if should_retry == response_status.IGNORE or should_retry.action == ResponseAction.RETRY: + return should_retry + return should_retry diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py new file mode 100644 index 0000000000000..179db638661f5 --- /dev/null 
+++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py
@@ -0,0 +1,149 @@
+#
+# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
+#
+
+from dataclasses import InitVar, dataclass, field
+from typing import Any, List, Mapping, MutableMapping, Optional, Union
+
+import airbyte_cdk.sources.declarative.requesters.error_handlers.response_status as response_status
+import requests
+from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import (
+    ExponentialBackoffStrategy,
+)
+from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy import BackoffStrategy
+from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler
+from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter
+from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction
+from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus
+from dataclasses_jsonschema import JsonSchemaMixin
+
+
+@dataclass
+class DefaultErrorHandler(ErrorHandler, JsonSchemaMixin):
+    """
+    Default error handler.
+
+    By default, the handler will only retry server errors (HTTP 5XX) and too many requests (HTTP 429) with exponential backoff.
+
+    If the response is successful, then return SUCCESS
+    Otherwise, iterate over the response_filters.
+    If any of the filters match the response, then return the appropriate status.
+    If the match is RETRY, then iterate sequentially over the backoff_strategies and return the first non-None backoff time.
+
+    Sample configs:
+
+    1. retry 10 times
+    `
+        error_handler:
+            max_retries: 10
+    `
+    2. backoff for 5 seconds
+    `
+        error_handler:
+            backoff_strategies:
+                - type: "ConstantBackoffStrategy"
+                  backoff_time_in_seconds: 5
+    `
+    3. retry on HTTP 404
+    `
+        error_handler:
+            response_filters:
+                - http_codes: [ 404 ]
+                  action: RETRY
+    `
+    4. ignore HTTP 404
+    `
+        error_handler:
+            response_filters:
+                - http_codes: [ 404 ]
+                  action: IGNORE
+    `
+    5. retry if error message contains `retrythisrequest!` substring
+    `
+        error_handler:
+            response_filters:
+                - error_message_contains: "retrythisrequest!"
+                  action: RETRY
+    `
+    6. retry if 'code' is a field present in the response body
+    `
+        error_handler:
+            response_filters:
+                - predicate: "{{ 'code' in response }}"
+                  action: RETRY
+    `
+
+    7. 
ignore 429 and retry on 404 + ` + error_handler: + - http_codes: [ 429 ] + action: IGNORE + - http_codes: [ 404 ] + action: RETRY + ` + + Attributes: + response_filters (Optional[List[HttpResponseFilter]]): response filters to iterate on + max_retries (Optional[int]): maximum retry attempts + backoff_strategies (Optional[List[BackoffStrategy]]): list of backoff strategies to use to determine how long + to wait before retrying + """ + + DEFAULT_BACKOFF_STRATEGY = ExponentialBackoffStrategy + + options: InitVar[Mapping[str, Any]] + response_filters: Optional[List[HttpResponseFilter]] = None + max_retries: Optional[int] = 5 + _max_retries: int = field(init=False, repr=False, default=5) + backoff_strategies: Optional[List[BackoffStrategy]] = None + + def __post_init__(self, options: Mapping[str, Any]): + self.response_filters = self.response_filters or [] + + if not self.response_filters: + self.response_filters.append( + HttpResponseFilter(ResponseAction.RETRY, http_codes=HttpResponseFilter.DEFAULT_RETRIABLE_ERRORS, options={}) + ) + self.response_filters.append(HttpResponseFilter(ResponseAction.IGNORE, options={})) + + if not self.backoff_strategies: + self.backoff_strategies = [DefaultErrorHandler.DEFAULT_BACKOFF_STRATEGY()] + + self._last_request_to_attempt_count: MutableMapping[requests.PreparedRequest, int] = {} + + @property + def max_retries(self) -> Union[int, None]: + return self._max_retries + + @max_retries.setter + def max_retries(self, value: Union[int, None]): + # Covers the case where max_retries is not provided in the constructor, which causes the property object + # to be set which we need to avoid doing + if not isinstance(value, property): + self._max_retries = value + + def should_retry(self, response: requests.Response) -> ResponseStatus: + request = response.request + + if request not in self._last_request_to_attempt_count: + self._last_request_to_attempt_count = {request: 1} + else: + self._last_request_to_attempt_count[request] += 1 + for response_filter in self.response_filters: + filter_action = response_filter.matches(response) + if filter_action is not None: + if filter_action == ResponseAction.RETRY: + return ResponseStatus(ResponseAction.RETRY, self._backoff_time(response, self._last_request_to_attempt_count[request])) + else: + return ResponseStatus(filter_action) + if response.ok: + return response_status.SUCCESS + # Fail if the response matches no filters + return response_status.FAIL + + def _backoff_time(self, response: requests.Response, attempt_count: int) -> Optional[float]: + backoff = None + for backoff_strategies in self.backoff_strategies: + backoff = backoff_strategies.backoff(response, attempt_count) + if backoff: + return backoff + return backoff diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py new file mode 100644 index 0000000000000..50b6412ad350e --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Union + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus + + +@dataclass +class ErrorHandler(ABC): + """ + Defines whether a request was successful and how to handle a failure. 
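A hedged usage sketch of DefaultErrorHandler with a single explicit filter (assuming the modules added in this diff are importable; responses that match no filter and are not ok fall through to FAIL):

```
import requests

from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler
from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter
from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction

# Ignore HTTP 404 instead of failing the sync.
handler = DefaultErrorHandler(
    options={},
    response_filters=[HttpResponseFilter(action="IGNORE", http_codes={404}, options={})],
)

response = requests.Response()
response.status_code = 404
assert handler.should_retry(response).action == ResponseAction.IGNORE
```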
+ """ + + @property + @abstractmethod + def max_retries(self) -> Union[int, None]: + """ + Specifies maximum amount of retries for backoff policy. Return None for no limit. + """ + pass + + @abstractmethod + def should_retry(self, response: requests.Response) -> ResponseStatus: + """ + Evaluate response status describing whether a failing request should be retried or ignored. + + :param response: response to evaluate + :return: response status + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py new file mode 100644 index 0000000000000..b37652383b586 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py @@ -0,0 +1,66 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Optional, Set, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction +from airbyte_cdk.sources.streams.http.http import HttpStream +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class HttpResponseFilter(JsonSchemaMixin): + """ + Filter to select HttpResponses + + Attributes: + action (Union[ResponseAction, str]): action to execute if a request matches + http_codes (Set[int]): http code of matching requests + error_message_contains (str): error substring of matching requests + predicate (str): predicate to apply to determine if a request is matching + """ + + TOO_MANY_REQUESTS_ERRORS = {429} + DEFAULT_RETRIABLE_ERRORS = set([x for x in range(500, 600)]).union(TOO_MANY_REQUESTS_ERRORS) + + action: Union[ResponseAction, str] + options: InitVar[Mapping[str, Any]] + http_codes: Set[int] = None + error_message_contains: str = None + predicate: Union[InterpolatedBoolean, str] = "" + + def __post_init__(self, options: Mapping[str, Any]): + if isinstance(self.action, str): + self.action = ResponseAction[self.action] + self.http_codes = self.http_codes or set() + if isinstance(self.predicate, str): + self.predicate = InterpolatedBoolean(condition=self.predicate, options=options) + + def matches(self, response: requests.Response) -> Optional[ResponseAction]: + """ + Apply the filter on the response and return the action to execute if it matches + :param response: The HTTP response to evaluate + :return: The action to execute. 
None if the response does not match the filter + """ + if ( + response.status_code in self.http_codes + or (self._response_matches_predicate(response)) + or (self._response_contains_error_message(response)) + ): + return self.action + else: + return None + + def _response_matches_predicate(self, response: requests.Response) -> bool: + return self.predicate and self.predicate.eval(None, response=response.json(), headers=response.headers) + + def _response_contains_error_message(self, response: requests.Response) -> bool: + if not self.error_message_contains: + return False + else: + error_message = HttpStream.parse_response_error_message(response) + return error_message and self.error_message_contains in error_message diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py new file mode 100644 index 0000000000000..2a5fd7e00c5ee --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from enum import Enum + + +class ResponseAction(Enum): + """ + Response statuses for non retriable responses + """ + + SUCCESS = "SUCCESS" # "Request was successful" + FAIL = "FAIL" # "Request failed unexpectedly" + IGNORE = "IGNORE" # "Request failed but can be ignored" + RETRY = "RETRY" # Request failed and should be retried diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py new file mode 100644 index 0000000000000..d089cef88f6bf --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from typing import Final, Optional, Union + +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction + + +class ResponseStatus: + """ + ResponseAction amended with backoff time if a action is RETRY + """ + + def __init__(self, response_action: Union[ResponseAction, str], retry_in: Optional[float] = None): + """ + :param response_action: response action to execute + :param retry_in: backoff time (if action is RETRY) + """ + if isinstance(response_action, str): + response_action = ResponseAction[response_action] + if retry_in and response_action != ResponseAction.RETRY: + raise ValueError(f"Unexpected backoff time ({retry_in} for non-retryable response action {response_action}") + self._retry_in = retry_in + self._action = response_action + + @property + def action(self): + """The ResponseAction to execute when a response matches the filter""" + return self._action + + @property + def retry_in(self) -> Optional[float]: + """How long to backoff before retrying a response. None if no wait required.""" + return self._retry_in + + @classmethod + def retry(cls, retry_in: Optional[float]) -> "ResponseStatus": + """ + Returns a ResponseStatus defining how long to backoff before retrying + + :param retry_in: how long to backoff before retrying. 
None if no wait required + :return: A response status defining how long to backoff before retrying + """ + return ResponseStatus(ResponseAction.RETRY, retry_in) + + def __eq__(self, other): + if not other: + return not self + return self.action == other.action and self.retry_in == other.retry_in + + def __hash__(self): + return hash([self.action, self.retry_in]) + + +"""Response is successful. No need to retry""" +SUCCESS: Final[ResponseStatus] = ResponseStatus(ResponseAction.SUCCESS) +"""Response is unsuccessful. The failure needs to be handled""" +FAIL: Final[ResponseStatus] = ResponseStatus(ResponseAction.FAIL) +"""Response is unsuccessful, but can be ignored. No need to retry""" +IGNORE: Final[ResponseStatus] = ResponseStatus(ResponseAction.IGNORE) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py new file mode 100644 index 0000000000000..4658e66c704f3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/http_requester.py @@ -0,0 +1,156 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from functools import lru_cache +from typing import Any, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler import ErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester +from airbyte_cdk.sources.declarative.types import Config, StreamSlice, StreamState +from airbyte_cdk.sources.streams.http.auth import HttpAuthenticator, NoAuth +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class HttpRequester(Requester, JsonSchemaMixin): + """ + Default implementation of a Requester + + Attributes: + name (str): Name of the stream. 
Only used for request/response caching + url_base (InterpolatedString): Base url to send requests to + path (InterpolatedString): Path to send requests to + http_method (Union[str, HttpMethod]): HTTP method to use when sending requests + request_options_provider (Optional[RequestOptionsProvider]): request option provider defining the options to set on outgoing requests + authenticator (HttpAuthenticator): Authenticator defining how to authenticate to the source + error_handler (Optional[ErrorHandler]): Error handler defining how to detect and handle errors + config (Config): The user-provided configuration as specified by the source's spec + """ + + name: str + url_base: InterpolatedString + path: InterpolatedString + config: Config + options: InitVar[Mapping[str, Any]] + http_method: Union[str, HttpMethod] = HttpMethod.GET + request_options_provider: Optional[RequestOptionsProvider] = None + authenticator: HttpAuthenticator = None + error_handler: Optional[ErrorHandler] = None + + def __post_init__(self, options: Mapping[str, Any]): + if self.request_options_provider is None: + self._request_options_provider = InterpolatedRequestOptionsProvider(config=self.config, options=options) + elif isinstance(self.request_options_provider, dict): + self._request_options_provider = InterpolatedRequestOptionsProvider(config=self.config, **self.request_options_provider) + else: + self._request_options_provider = self.request_options_provider + self.authenticator = self.authenticator or NoAuth() + if type(self.http_method) == str: + self.http_method = HttpMethod[self.http_method] + self._method = self.http_method + self.error_handler = self.error_handler or DefaultErrorHandler(options=options) + self._options = options + + # We are using an LRU cache in should_retry() method which requires all incoming arguments (including self) to be hashable. + # Dataclasses by default are not hashable, so we need to define __hash__(). Alternatively, we can set @dataclass(frozen=True), + # but this has a cascading effect where all dataclass fields must also be set to frozen. + def __hash__(self): + return hash(tuple(self.__dict__)) + + def get_authenticator(self): + return self.authenticator + + def get_url_base(self): + return self.url_base.eval(self.config) + + def get_path( + self, *, stream_state: Optional[StreamState], stream_slice: Optional[StreamSlice], next_page_token: Optional[Mapping[str, Any]] + ) -> str: + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + path = self.path.eval(self.config, **kwargs) + return path + + def get_method(self): + return self._method + + # use a tiny cache to limit the memory footprint. 
It doesn't have to be large because we mostly + # only care about the status of the last response received + @lru_cache(maxsize=10) + def should_retry(self, response: requests.Response) -> ResponseStatus: + # Cache the result because the HttpStream first checks if we should retry before looking at the backoff time + return self.error_handler.should_retry(response) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + return self._request_options_provider.get_request_params( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._request_options_provider.get_request_headers( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Union[Mapping, str]]: + return self._request_options_provider.get_request_body_data( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + return self._request_options_provider.get_request_body_json( + stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + + def request_kwargs( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + # todo: there are a few integrations that override the request_kwargs() method, but the use case for why kwargs over existing + # constructs is a little unclear. We may revisit this, but for now lets leave it out of the DSL + return {} + + @property + def cache_filename(self) -> str: + # FIXME: this should be declarative + return f"{self.name}.yml" + + @property + def use_cache(self) -> bool: + # FIXME: this should be declarative + return False diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py new file mode 100644 index 0000000000000..d0310b21c199c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
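A hedged construction sketch for the HttpRequester defined above (URL, path and stream name are made up; the authenticator, request options provider and error handler fall back to their defaults):

```
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester

requester = HttpRequester(
    name="orders",
    url_base=InterpolatedString(string="https://api.example.com/v1/", options={}),
    path=InterpolatedString(string="orders", options={}),
    http_method="GET",
    config={},
    options={},
)

requester.get_url_base()  # "https://api.example.com/v1/"
requester.get_path(stream_state=None, stream_slice=None, next_page_token=None)  # "orders"
```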
+# + +from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import LimitPaginator +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy + +__all__ = ["LimitPaginator", "NoPagination", "PaginationStrategy", "Paginator"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/limit_paginator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/limit_paginator.py new file mode 100644 index 0000000000000..bf9adcbd515b8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/limit_paginator.py @@ -0,0 +1,161 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.types import Config, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class LimitPaginator(Paginator, JsonSchemaMixin): + """ + Limit paginator to request pages of results with a fixed size until the pagination strategy no longer returns a next_page_token + + Examples: + 1. + * fetches up to 10 records at a time by setting the "limit" request param to 10 + * updates the request path with "{{ response._metadata.next }}" + paginator: + type: "LimitPaginator" + page_size: 10 + limit_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + option_type: path + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + ` + + 2. + * fetches up to 5 records at a time by setting the "page_size" header to 5 + * increments a record counter and set the request parameter "offset" to the value of the counter + ` + paginator: + type: "LimitPaginator" + page_size: 5 + limit_option: + inject_into: header + field_name: page_size + pagination_strategy: + type: "OffsetIncrement" + page_token: + option_type: "request_parameter" + field_name: "offset" + ` + + 3. + * fetches up to 5 records at a time by setting the "page_size" request param to 5 + * increments a page counter and set the request parameter "page" to the value of the counter + ` + paginator: + type: "LimitPaginator" + page_size: 5 + limit_option: + inject_into: request_parameter + field_name: page_size + pagination_strategy: + type: "PageIncrement" + page_token: + option_type: "request_parameter" + field_name: "page" + + Attributes: + page_size (int): the number of records to request + limit_option (RequestOption): the request option to set the limit. Cannot be injected in the path. 
+ page_token_option (RequestOption): the request option to set the page token + pagination_strategy (PaginationStrategy): Strategy defining how to get the next page token + config (Config): connection config + url_base (Union[InterpolatedString, str]): endpoint's base url + decoder (Decoder): decoder to decode the response + """ + + page_size: int + limit_option: RequestOption + page_token_option: RequestOption + pagination_strategy: PaginationStrategy + config: Config + url_base: Union[InterpolatedString, str] + options: InitVar[Mapping[str, Any]] + decoder: Decoder = JsonDecoder(options={}) + _token: Optional[Any] = field(init=False, repr=False, default=None) + + def __post_init__(self, options: Mapping[str, Any]): + if self.limit_option.inject_into == RequestOptionType.path: + raise ValueError("Limit parameter cannot be a path") + if isinstance(self.url_base, str): + self.url_base = InterpolatedString(string=self.url_base, options=options) + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Mapping[str, Any]]: + self._token = self.pagination_strategy.next_page_token(response, last_records) + if self._token: + return {"next_page_token": self._token} + else: + return None + + def path(self): + if self._token and self.page_token_option.inject_into == RequestOptionType.path: + # Replace url base to only return the path + return str(self._token).replace(self.url_base.eval(self.config), "") + else: + return None + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, str]: + return self._get_request_options(RequestOptionType.header) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_data) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json) + + def reset(self): + self.pagination_strategy.reset() + + def _get_request_options(self, option_type: RequestOptionType) -> Mapping[str, Any]: + options = {} + if self.page_token_option.inject_into == option_type: + if option_type != RequestOptionType.path and self._token: + options[self.page_token_option.field_name] = self._token + if self.limit_option.inject_into == option_type: + if option_type != RequestOptionType.path: + options[self.limit_option.field_name] = self.page_size + return options diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py new file mode 100644 index 0000000000000..210b00c731236 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py @@ -0,0 +1,65 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights 
reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.types import StreamSlice, StreamState + + +@dataclass +class NoPagination(Paginator): + """ + Pagination implementation that never returns a next page. + """ + + options: InitVar[Mapping[str, Any]] + + def path(self) -> Optional[str]: + return None + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, str]: + return {} + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Union[Mapping[str, Any], str]: + return {} + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Mapping[str, Any]: + return {} + + def reset(self): + # No state to reset + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py new file mode 100644 index 0000000000000..68b18307e0883 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/paginator.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider + + +@dataclass +class Paginator(RequestOptionsProvider): + """ + Defines the token to use to fetch the next page of records from the API. + + If needed, the Paginator will set request options to be set on the HTTP request to fetch the next page of records. + If the next_page_token is the path to the next page of records, then it should be accessed through the `path` method + """ + + @abstractmethod + def reset(self): + """ + Reset the pagination's inner state + """ + + @abstractmethod + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Mapping[str, Any]]: + """ + Returns the next_page_token to use to fetch the next page of records. + + :param response: the response to process + :param last_records: the records extracted from the response + :return: A mapping {"next_page_token": } for the next page from the input response object. Returning None means there are no more pages to read in this response. 
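To make the paginator contract concrete, here is a minimal, hedged sketch (illustrative only, not part of this changeset) that exercises the `NoPagination` implementation defined above; the empty `requests.Response()` object is a placeholder used purely for the example:

```python
import requests

from airbyte_cdk.sources.declarative.requesters.paginators import NoPagination

# NoPagination never produces a token, never overrides the request path,
# and contributes no extra request options, so a stream using it reads a single page.
paginator = NoPagination(options={})
response = requests.Response()  # empty placeholder response, for illustration only

assert paginator.next_page_token(response, last_records=[]) == {}
assert paginator.path() is None
assert paginator.get_request_params() == {}
```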
+ """ + pass + + @abstractmethod + def path(self) -> Optional[str]: + """ + Returns the URL path to hit to fetch the next page of records + + e.g: if you wanted to hit https://myapi.com/v1/some_entity then this will return "some_entity" + + :return: path to hit to fetch the next request. Returning None means the path is not defined by the next_page_token + """ + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py new file mode 100644 index 0000000000000..4b4f9d259d9b7 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py @@ -0,0 +1,9 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement + +__all__ = ["CursorPaginationStrategy", "OffsetIncrement", "PageIncrement"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py new file mode 100644 index 0000000000000..5940936ac6485 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional, Union + +import requests +from airbyte_cdk.sources.declarative.decoders.decoder import Decoder +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from airbyte_cdk.sources.declarative.types import Config +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class CursorPaginationStrategy(PaginationStrategy, JsonSchemaMixin): + """ + Pagination strategy that evaluates an interpolated string to define the next page token + + Attributes: + cursor_value (Union[InterpolatedString, str]): template string evaluating to the cursor value + config (Config): connection config + stop_condition (Optional[InterpolatedBoolean]): template string evaluating when to stop paginating + decoder (Decoder): decoder to decode the response + """ + + cursor_value: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + stop_condition: Optional[InterpolatedBoolean] = None + decoder: Decoder = JsonDecoder(options={}) + + def __post_init__(self, options: Mapping[str, Any]): + if isinstance(self.cursor_value, str): + self.cursor_value = InterpolatedString.create(self.cursor_value, options=options) + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Any]: + decoded_response = self.decoder.decode(response) + + # The default way that link is presented 
in requests.Response is a string of various links (last, next, etc). This + # is not indexable or useful for parsing the cursor, so we replace it with the link dictionary from response.links + headers = response.headers + headers["link"] = response.links + + if self.stop_condition: + should_stop = self.stop_condition.eval(self.config, response=decoded_response, headers=headers, last_records=last_records) + if should_stop: + return None + token = self.cursor_value.eval(config=self.config, last_records=last_records, response=decoded_response, headers=headers) + return token if token else None + + def reset(self): + # No state to reset + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py new file mode 100644 index 0000000000000..e6ab8a03fb589 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class OffsetIncrement(PaginationStrategy, JsonSchemaMixin): + """ + Pagination strategy that returns the number of records reads so far and returns it as the next page token + + Attributes: + page_size (int): the number of records to request + """ + + page_size: int + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._offset = 0 + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Any]: + if len(last_records) < self.page_size: + return None + else: + self._offset += len(last_records) + return self._offset + + def reset(self): + self._offset = 0 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py new file mode 100644 index 0000000000000..46e112a0397f5 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.pagination_strategy import PaginationStrategy +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class PageIncrement(PaginationStrategy, JsonSchemaMixin): + """ + Pagination strategy that returns the number of pages reads so far and returns it as the next page token + + Attributes: + page_size (int): the number of records to request + """ + + page_size: int + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self._page = 0 + + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Any]: + if len(last_records) < self.page_size: + return None + else: + self._page += 1 + return self._page + + def reset(self): + self._page = 0 diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py new file mode 100644 index 0000000000000..a2d9407a833dc --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, List, Mapping, Optional + +import requests +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class PaginationStrategy(JsonSchemaMixin): + """ + Defines how to get the next page token + """ + + @abstractmethod + def next_page_token(self, response: requests.Response, last_records: List[Mapping[str, Any]]) -> Optional[Any]: + """ + :param response: response to process + :param last_records: records extracted from the response + :return: next page token. Returns None if there are no more pages to fetch + """ + pass + + @abstractmethod + def reset(self): + """ + Reset the pagination's inner state + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py new file mode 100644 index 0000000000000..1ed01f34b87c3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_option.py @@ -0,0 +1,47 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from enum import Enum +from typing import Any, Mapping, Optional + +from dataclasses_jsonschema import JsonSchemaMixin + + +class RequestOptionType(Enum): + """ + Describes where to set a value on a request + """ + + request_parameter = "request_parameter" + header = "header" + path = "path" + body_data = "body_data" + body_json = "body_json" + + +@dataclass +class RequestOption(JsonSchemaMixin): + """ + Describes an option to set on a request + + Attributes: + inject_into (RequestOptionType): Describes where in the HTTP request to inject the parameter + field_name (Optional[str]): Describes the name of the parameter to inject. None if option_type == path. Required otherwise. 
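Pulling the pieces above together, the following is a hedged sketch (not part of this diff) of wiring a `LimitPaginator` to an `OffsetIncrement` strategy through `RequestOption`s; the base URL and record payloads are invented for the example:

```python
import requests

from airbyte_cdk.sources.declarative.requesters.paginators import LimitPaginator
from airbyte_cdk.sources.declarative.requesters.paginators.strategies import OffsetIncrement
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType

paginator = LimitPaginator(
    page_size=2,
    # inject the page size as the "page_size" query parameter
    limit_option=RequestOption(inject_into=RequestOptionType.request_parameter, field_name="page_size", options={}),
    # inject the next-page token as the "offset" query parameter
    page_token_option=RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", options={}),
    pagination_strategy=OffsetIncrement(page_size=2, options={}),
    config={},
    url_base="https://myapi.com/v1/",  # illustrative base URL
    options={},
)

# A full page of 2 records was read, so the offset strategy advances to 2.
token = paginator.next_page_token(requests.Response(), last_records=[{"id": 1}, {"id": 2}])
assert token == {"next_page_token": 2}
assert paginator.get_request_params() == {"offset": 2, "page_size": 2}
```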
+ """ + + inject_into: RequestOptionType + options: InitVar[Mapping[str, Any]] + field_name: Optional[str] = None + + def __post_init__(self, options: Mapping[str, Any]): + if self.inject_into == RequestOptionType.path: + if self.field_name is not None: + raise ValueError(f"RequestOption with path cannot have a field name. Get {self.field_name}") + elif self.field_name is None: + raise ValueError(f"RequestOption expected field name for type {self.inject_into}") + + def is_path(self) -> bool: + """Returns true if the parameter is the path to send the request to""" + return self.inject_into == RequestOptionType.path diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py new file mode 100644 index 0000000000000..9bb93d757d126 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/__init__.py @@ -0,0 +1,10 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider + +__all__ = ["InterpolatedRequestOptionsProvider", "RequestOptionsProvider"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py new file mode 100644 index 0000000000000..e3bf9e0f0e19c --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py @@ -0,0 +1,52 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from typing import Any, Mapping, Optional, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.types import Config, StreamSlice, StreamState + + +class InterpolatedRequestInputProvider: + """ + Helper class that generically performs string interpolation on the provided dictionary or string input + """ + + def __init__( + self, *, config: Config, request_inputs: Optional[Union[str, Mapping[str, str]]] = None, **options: Optional[Mapping[str, Any]] + ): + """ + :param config: The user-provided configuration as specified by the source's spec + :param request_inputs: The dictionary to interpolate + :param options: Additional runtime parameters to be used for string interpolation + """ + + self._config = config + + if request_inputs is None: + request_inputs = {} + if isinstance(request_inputs, str): + self._interpolator = InterpolatedString(request_inputs, default="", options=options) + else: + self._interpolator = InterpolatedMapping(request_inputs, options=options) + + def request_inputs( + self, stream_state: StreamState, stream_slice: Optional[StreamSlice] = None, next_page_token: Mapping[str, Any] = None + ) -> Mapping[str, Any]: + """ + Returns the request inputs to set on an outgoing HTTP request + + :param stream_state: The stream state + :param stream_slice: The stream slice + :param next_page_token: The pagination token + :return: The request inputs to set on an outgoing HTTP request + """ + kwargs = {"stream_state": stream_state, "stream_slice": stream_slice, "next_page_token": next_page_token} + interpolated_value = self._interpolator.eval(self._config, **kwargs) + + if isinstance(interpolated_value, dict): + non_null_tokens = {k: v for k, v in interpolated_value.items() if v} + return non_null_tokens + return interpolated_value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py new file mode 100644 index 0000000000000..6348ccc35884b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import InterpolatedRequestInputProvider +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.declarative.types import Config, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + +RequestInput = Union[str, Mapping[str, str]] + + +@dataclass +class InterpolatedRequestOptionsProvider(RequestOptionsProvider, JsonSchemaMixin): + """ + Defines the request options to set on an outgoing HTTP request by evaluating `InterpolatedMapping`s + + Attributes: + config (Config): The user-provided configuration as specified by the source's spec + request_parameters (Union[str, Mapping[str, str]]): The request parameters to set on an outgoing HTTP request + request_headers (Union[str, Mapping[str, str]]): The request headers to set on an outgoing HTTP request + request_body_data (Union[str, Mapping[str, str]]): The body data to set on an outgoing HTTP request + request_body_json (Union[str, Mapping[str, str]]): The json content to set on an outgoing HTTP request + """ + + options: InitVar[Mapping[str, Any]] + config: Config = field(default_factory=dict) + request_parameters: Optional[RequestInput] = None + request_headers: Optional[RequestInput] = None + request_body_data: Optional[RequestInput] = None + request_body_json: Optional[RequestInput] = None + + def __post_init__(self, options: Mapping[str, Any]): + if self.request_parameters is None: + self.request_parameters = {} + if self.request_headers is None: + self.request_headers = {} + if self.request_body_data is None: + self.request_body_data = {} + if self.request_body_json is None: + self.request_body_json = {} + + if self.request_body_json and self.request_body_data: + raise ValueError("RequestOptionsProvider should only contain either 'request_body_data' or 'request_body_json' not both") + + self._parameter_interpolator = InterpolatedRequestInputProvider(config=self.config, request_inputs=self.request_parameters) + self._headers_interpolator = InterpolatedRequestInputProvider(config=self.config, request_inputs=self.request_headers) + self._body_data_interpolator = InterpolatedRequestInputProvider(config=self.config, request_inputs=self.request_body_data) + self._body_json_interpolator = InterpolatedRequestInputProvider(config=self.config, request_inputs=self.request_body_json) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + interpolated_value = self._parameter_interpolator.request_inputs(stream_state, stream_slice, next_page_token) + if isinstance(interpolated_value, dict): + return interpolated_value + return {} + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._headers_interpolator.request_inputs(stream_state, stream_slice, next_page_token) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Union[Mapping, 
str]]: + return self._body_data_interpolator.request_inputs(stream_state, stream_slice, next_page_token) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + return self._body_json_interpolator.request_inputs(stream_state, stream_slice, next_page_token) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py new file mode 100644 index 0000000000000..1be5fa690349f --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py @@ -0,0 +1,79 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Mapping, MutableMapping, Optional, Union + +from airbyte_cdk.sources.declarative.types import StreamSlice, StreamState + + +@dataclass +class RequestOptionsProvider(ABC): + """ + Defines the request options to set on an outgoing HTTP request + + Options can be passed by + - request parameter + - request headers + - body data + - json content + """ + + @abstractmethod + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + pass + + @abstractmethod + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method.""" + + @abstractmethod + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Union[Mapping, str]]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + + @abstractmethod + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. 
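As a hedged illustration of the `InterpolatedRequestOptionsProvider` defined earlier in this diff (the config keys, template strings, and slice values below are invented for the example, and the exact interpolation output depends on the Jinja evaluation wired through `InterpolatedRequestInputProvider`):

```python
from airbyte_cdk.sources.declarative.requesters.request_options import InterpolatedRequestOptionsProvider

provider = InterpolatedRequestOptionsProvider(
    config={"region": "us-east-1"},
    request_parameters={
        "region": "{{ config['region'] }}",
        "updated_since": "{{ stream_slice['start_time'] }}",
    },
    options={},
)

params = provider.get_request_params(stream_slice={"start_time": "2022-01-01"})
# Expected to interpolate against the config and the slice, e.g.:
# {"region": "us-east-1", "updated_since": "2022-01-01"}
```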
+ """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py new file mode 100644 index 0000000000000..24c4211df5ed8 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/requesters/requester.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from enum import Enum +from typing import Any, Mapping, MutableMapping, Optional + +import requests +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.declarative.types import StreamSlice, StreamState +from requests.auth import AuthBase + + +class HttpMethod(Enum): + """ + Http Method to use when submitting an outgoing HTTP request + """ + + GET = "GET" + POST = "POST" + + +class Requester(RequestOptionsProvider): + @abstractmethod + def get_authenticator(self) -> AuthBase: + """ + Specifies the authenticator to use when submitting requests + """ + pass + + @abstractmethod + def get_url_base(self) -> str: + """ + :return: URL base for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "https://myapi.com/v1/" + """ + + @abstractmethod + def get_path( + self, + *, + stream_state: Optional[StreamState], + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + ) -> str: + """ + Returns the URL path for the API endpoint e.g: if you wanted to hit https://myapi.com/v1/some_entity then this should return "some_entity" + """ + + @abstractmethod + def get_method(self) -> HttpMethod: + """ + Specifies the HTTP method to use + """ + + @abstractmethod + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + + @abstractmethod + def should_retry(self, response: requests.Response) -> ResponseStatus: + """ + Specifies conditions for backoff based on the response from the server. + + By default, back off on the following HTTP response statuses: + - 429 (Too Many Requests) indicating rate limiting + - 500s to handle transient server errors + + Unexpected but transient exceptions (connection timeout, DNS resolution failed, etc..) are retried by default. + """ + + @abstractmethod + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Return any non-auth headers. Authentication headers will overwrite any overlapping headers returned from this method. + """ + + @abstractmethod + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. 
+ If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + + @abstractmethod + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping[str, Any]]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + + @abstractmethod + def request_kwargs( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Returns a mapping of keyword arguments to be used when creating the HTTP request. + Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from + this method. Note that these options do not conflict with request-level options such as headers, request params, etc.. + """ + + @property + @abstractmethod + def cache_filename(self) -> str: + """ + Return the name of cache file + """ + + @property + @abstractmethod + def use_cache(self) -> bool: + """ + If True, all records will be cached. + """ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py new file mode 100644 index 0000000000000..9c47818b3e725 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever + +__all__ = ["Retriever", "SimpleRetriever"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py new file mode 100644 index 0000000000000..a9ae02806425a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/retriever.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Iterable, List, Optional + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState + + +@dataclass +class Retriever(ABC): + """ + Responsible for fetching a stream's records from an HTTP API source. 
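A brief, hedged sketch of how a caller might drive the `Retriever` interface above; the `read_stream` helper is hypothetical and only illustrates the slice/record loop, it is not part of this changeset:

```python
from typing import Iterable

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.declarative.retrievers import Retriever
from airbyte_cdk.sources.declarative.types import Record


def read_stream(retriever: Retriever) -> Iterable[Record]:
    # Ask the retriever for its slices, then read every record in each slice.
    for stream_slice in retriever.stream_slices(sync_mode=SyncMode.full_refresh, stream_state=retriever.state):
        yield from retriever.read_records(
            SyncMode.full_refresh, stream_slice=stream_slice, stream_state=retriever.state
        )
```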
+ """ + + @abstractmethod + def read_records( + self, + sync_mode: SyncMode, + cursor_field: Optional[List[str]] = None, + stream_slice: Optional[StreamSlice] = None, + stream_state: Optional[StreamState] = None, + ) -> Iterable[Record]: + """ + Fetch a stream's records from an HTTP API source + + :param sync_mode: Unused but currently necessary for integrating with HttpStream + :param cursor_field: Unused but currently necessary for integrating with HttpStream + :param stream_slice: The stream slice to read data for + :param stream_state: The initial stream state + :return: The records read from the API source + """ + + @abstractmethod + def stream_slices(self, *, sync_mode: SyncMode, stream_state: Optional[StreamState] = None) -> Iterable[Optional[StreamSlice]]: + """Returns the stream slices""" + + @property + @abstractmethod + def state(self) -> StreamState: + """State getter, should return state in form that can serialized to a string and send to the output + as a STATE AirbyteMessage. + + A good example of a state is a cursor_value: + { + self.cursor_field: "cursor_value" + } + + State should try to be as small as possible but at the same time descriptive enough to restore + syncing process from the point where it stopped. + """ + + @state.setter + @abstractmethod + def state(self, value: StreamState): + """State setter, accept state serialized by state getter.""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py new file mode 100644 index 0000000000000..8443026161d51 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py @@ -0,0 +1,377 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union + +import requests +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.extractors.http_selector import HttpSelector +from airbyte_cdk.sources.declarative.read_exception import ReadException +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination +from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator +from airbyte_cdk.sources.declarative.requesters.requester import Requester +from airbyte_cdk.sources.declarative.retrievers.retriever import Retriever +from airbyte_cdk.sources.declarative.stream_slicers.single_slice import SingleSlice +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState +from airbyte_cdk.sources.streams.http import HttpStream +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class SimpleRetriever(Retriever, HttpStream, JsonSchemaMixin): + """ + Retrieves records by synchronously sending requests to fetch records. + + The retriever acts as an orchestrator between the requester, the record selector, the paginator, and the stream slicer. + + For each stream slice, submit requests until there are no more pages of records to fetch. + + This retriever currently inherits from HttpStream to reuse the request submission and pagination machinery. + As a result, some of the parameters passed to some methods are unused. 
+ The two will be decoupled in a future release. + + Attributes: + stream_name (str): The stream's name + stream_primary_key (Optional[Union[str, List[str], List[List[str]]]]): The stream's primary key + requester (Requester): The HTTP requester + record_selector (HttpSelector): The record selector + paginator (Optional[Paginator]): The paginator + stream_slicer (Optional[StreamSlicer]): The stream slicer + options (Mapping[str, Any]): Additional runtime parameters to be used for string interpolation + """ + + requester: Requester + record_selector: HttpSelector + options: InitVar[Mapping[str, Any]] + name: str + _name: str = field(init=False, repr=False) + primary_key: Optional[Union[str, List[str], List[List[str]]]] + _primary_key: str = field(init=False, repr=False) + paginator: Optional[Paginator] = None + stream_slicer: Optional[StreamSlicer] = SingleSlice(options={}) + + def __post_init__(self, options: Mapping[str, Any]): + self.paginator = self.paginator or NoPagination(options=options) + HttpStream.__init__(self, self.requester.get_authenticator()) + self._last_response = None + self._last_records = None + + @property + def name(self) -> str: + """ + :return: Stream name + """ + return self._name + + @name.setter + def name(self, value: str) -> None: + if not isinstance(value, property): + self._name = value + + @property + def url_base(self) -> str: + return self.requester.get_url_base() + + @property + def http_method(self) -> str: + return str(self.requester.get_method().value) + + @property + def raise_on_http_errors(self) -> bool: + # never raise on http_errors because this overrides the error handler logic... + return False + + def should_retry(self, response: requests.Response) -> bool: + """ + Specifies conditions for backoff based on the response from the server. + + By default, back off on the following HTTP response statuses: + - 429 (Too Many Requests) indicating rate limiting + - 500s to handle transient server errors + + Unexpected but transient exceptions (connection timeout, DNS resolution failed, etc..) are retried by default. + """ + return self.requester.should_retry(response).action == ResponseAction.RETRY + + def backoff_time(self, response: requests.Response) -> Optional[float]: + """ + Specifies backoff time. + + This method is called only if should_backoff() returns True for the input request. + + :param response: + :return how long to backoff in seconds. The return value may be a floating point number for subsecond precision. Returning None defers backoff + to the default backoff behavior (e.g using an exponential algorithm). + """ + should_retry = self.requester.should_retry(response) + if should_retry.action != ResponseAction.RETRY: + raise ValueError(f"backoff_time can only be applied on retriable response action. 
Got {should_retry.action}") + assert should_retry.action == ResponseAction.RETRY + return should_retry.retry_in + + def _get_request_options( + self, + stream_slice: Optional[StreamSlice], + next_page_token: Optional[Mapping[str, Any]], + requester_method, + paginator_method, + stream_slicer_method, + ): + """ + Get the request_option from the requester and from the paginator + Raise a ValueError if there's a key collision + Returned merged mapping otherwise + :param stream_slice: + :param next_page_token: + :param requester_method: + :param paginator_method: + :return: + """ + + requester_mapping = requester_method(stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token) + requester_mapping_keys = set(requester_mapping.keys()) + paginator_mapping = paginator_method(stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token) + paginator_mapping_keys = set(paginator_mapping.keys()) + stream_slicer_mapping = stream_slicer_method(stream_slice=stream_slice) + stream_slicer_mapping_keys = set(stream_slicer_mapping.keys()) + + intersection = ( + (requester_mapping_keys & paginator_mapping_keys) + | (requester_mapping_keys & stream_slicer_mapping_keys) + | (paginator_mapping_keys & stream_slicer_mapping_keys) + ) + if intersection: + raise ValueError(f"Duplicate keys found: {intersection}") + return {**requester_mapping, **paginator_mapping, **stream_slicer_mapping} + + def request_headers( + self, stream_state: StreamState, stream_slice: Optional[StreamSlice] = None, next_page_token: Optional[Mapping[str, Any]] = None + ) -> Mapping[str, Any]: + """ + Specifies request headers. + Authentication headers will overwrite any overlapping headers returned from this method. + """ + headers = self._get_request_options( + stream_slice, + next_page_token, + self.requester.get_request_headers, + self.paginator.get_request_headers, + self.stream_slicer.get_request_headers, + ) + return {str(k): str(v) for k, v in headers.items()} + + def request_params( + self, + stream_state: StreamSlice, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> MutableMapping[str, Any]: + """ + Specifies the query parameters that should be set on an outgoing HTTP request given the inputs. + + E.g: you might want to define query parameters for paging if next_page_token is not None. + """ + return self._get_request_options( + stream_slice, + next_page_token, + self.requester.get_request_params, + self.paginator.get_request_params, + self.stream_slicer.get_request_params, + ) + + def request_body_data( + self, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Union[Mapping, str]]: + """ + Specifies how to populate the body of the request with a non-JSON payload. + + If returns a ready text that it will be sent as is. + If returns a dict that it will be converted to a urlencoded form. + E.g. {"key1": "value1", "key2": "value2"} => "key1=value1&key2=value2" + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + # Warning: use self.state instead of the stream_state passed as argument! 
+ base_body_data = self.requester.get_request_body_data( + stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token + ) + if isinstance(base_body_data, str): + paginator_body_data = self.paginator.get_request_body_data() + if paginator_body_data: + raise ValueError( + f"Cannot combine requester's body data= {base_body_data} with paginator's body_data: {paginator_body_data}" + ) + else: + return base_body_data + return self._get_request_options( + stream_slice, + next_page_token, + self.requester.get_request_body_data, + self.paginator.get_request_body_data, + self.stream_slicer.get_request_body_data, + ) + + def request_body_json( + self, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + """ + Specifies how to populate the body of the request with a JSON payload. + + At the same time only one of the 'request_body_data' and 'request_body_json' functions can be overridden. + """ + # Warning: use self.state instead of the stream_state passed as argument! + return self._get_request_options( + stream_slice, + next_page_token, + self.requester.get_request_body_json, + self.paginator.get_request_body_json, + self.stream_slicer.get_request_body_json, + ) + + def request_kwargs( + self, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + """ + Specifies how to configure a mapping of keyword arguments to be used when creating the HTTP request. + Any option listed in https://docs.python-requests.org/en/latest/api/#requests.adapters.BaseAdapter.send for can be returned from + this method. Note that these options do not conflict with request-level options such as headers, request params, etc.. + """ + # Warning: use self.state instead of the stream_state passed as argument! + return self.requester.request_kwargs(stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token) + + def path( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> str: + """ + Return the path the submit the next request to. + If the paginator points to a path, follow it, else return the requester's path + :param stream_state: + :param stream_slice: + :param next_page_token: + :return: + """ + # Warning: use self.state instead of the stream_state passed as argument! + paginator_path = self.paginator.path() + if paginator_path: + return paginator_path + else: + return self.requester.get_path(stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token) + + @property + def cache_filename(self) -> str: + """ + Return the name of cache file + """ + return self.requester.cache_filename + + @property + def use_cache(self) -> bool: + """ + If True, all records will be cached. 
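The request options assembled by the retriever come from three independent components (requester, paginator, stream slicer). The following is a small, self-contained sketch of the merge-and-collision rule that `_get_request_options` above applies; the helper function and sample dictionaries are hypothetical:

```python
from typing import Any, Mapping


def merge_request_options(
    requester_options: Mapping[str, Any],
    paginator_options: Mapping[str, Any],
    slicer_options: Mapping[str, Any],
) -> Mapping[str, Any]:
    # Mirrors the rule above: any key set by more than one component is a configuration error.
    requester_keys, paginator_keys, slicer_keys = (
        set(requester_options),
        set(paginator_options),
        set(slicer_options),
    )
    intersection = (requester_keys & paginator_keys) | (requester_keys & slicer_keys) | (paginator_keys & slicer_keys)
    if intersection:
        raise ValueError(f"Duplicate keys found: {intersection}")
    return {**requester_options, **paginator_options, **slicer_options}


merge_request_options({"updated_since": "2022-01-01"}, {"offset": 2}, {"region": "us"})
# -> {"updated_since": "2022-01-01", "offset": 2, "region": "us"}
# merge_request_options({"page_size": 10}, {"page_size": 5}, {})  # would raise ValueError
```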
+ """ + return self.requester.use_cache + + def parse_response( + self, + response: requests.Response, + *, + stream_state: StreamState, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Iterable[Record]: + # if fail -> raise exception + # if ignore -> ignore response and return no records + # else -> delegate to record selector + response_status = self.requester.should_retry(response) + if response_status.action == ResponseAction.FAIL: + raise ReadException(f"Request {response.request} failed with response {response}") + elif response_status.action == ResponseAction.IGNORE: + self.logger.info(f"Ignoring response for failed request with error message {HttpStream.parse_response_error_message(response)}") + return [] + + # Warning: use self.state instead of the stream_state passed as argument! + self._last_response = response + records = self.record_selector.select_records( + response=response, stream_state=self.state, stream_slice=stream_slice, next_page_token=next_page_token + ) + self._last_records = records + return records + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + """The stream's primary key""" + return self._primary_key + + @primary_key.setter + def primary_key(self, value: str) -> None: + if not isinstance(value, property): + self._primary_key = value + + def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]: + """ + Specifies a pagination strategy. + + The value returned from this method is passed to most other methods in this class. Use it to form a request e.g: set headers or query params. + + :return: The token for the next page from the input response object. Returning None means there are no more pages to read in this response. + """ + return self.paginator.next_page_token(response, self._last_records) + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Optional[StreamSlice] = None, + stream_state: Optional[StreamState] = None, + ) -> Iterable[Mapping[str, Any]]: + # Warning: use self.state instead of the stream_state passed as argument! + stream_slice = stream_slice or {} # None-check + self.paginator.reset() + records_generator = HttpStream.read_records(self, sync_mode, cursor_field, stream_slice, self.state) + for r in records_generator: + self.stream_slicer.update_cursor(stream_slice, last_record=r) + yield r + else: + last_record = self._last_records[-1] if self._last_records else None + self.stream_slicer.update_cursor(stream_slice, last_record=last_record) + yield from [] + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Optional[StreamState] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + """ + Specifies the slices for this stream. See the stream slicing section of the docs for more information. + + :param sync_mode: + :param cursor_field: + :param stream_state: + :return: + """ + # Warning: use self.state instead of the stream_state passed as argument! 
+ return self.stream_slicer.stream_slices(sync_mode, self.state) + + @property + def state(self) -> MutableMapping[str, Any]: + return self.stream_slicer.get_stream_state() + + @state.setter + def state(self, value: StreamState): + """State setter, accept state serialized by state getter.""" + self.stream_slicer.update_cursor(value) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py new file mode 100644 index 0000000000000..cbef6eb1d268b --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.schema.json_schema import JsonSchema +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader + +__all__ = ["JsonSchema", "SchemaLoader"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py new file mode 100644 index 0000000000000..e3a42dd04f17a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/json_schema.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import json +from dataclasses import InitVar, dataclass +from typing import Any, Mapping, Union + +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.schema.schema_loader import SchemaLoader +from airbyte_cdk.sources.declarative.types import Config +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class JsonSchema(SchemaLoader, JsonSchemaMixin): + """ + Loads the schema from a json file + + Attributes: + file_path (Union[InterpolatedString, str]): The path to the json file describing the schema + name (str): The stream's name + config (Config): The user-provided configuration as specified by the source's spec + options (Mapping[str, Any]): Additional arguments to pass to the string interpolation if needed + """ + + file_path: Union[InterpolatedString, str] + name: str + config: Config + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + self.file_path = InterpolatedString.create(self.file_path, options=options) + + def get_json_schema(self) -> Mapping[str, Any]: + json_schema_path = self._get_json_filepath() + with open(json_schema_path, "r") as f: + return json.loads(f.read()) + + def _get_json_filepath(self): + return self.file_path.eval(self.config) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py new file mode 100644 index 0000000000000..3a0d45316a4e3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/schema/schema_loader.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Any, Mapping + + +@dataclass +class SchemaLoader(ABC): + """Describes a stream's schema""" + + @abstractmethod + def get_json_schema(self) -> Mapping[str, Any]: + """Returns a mapping describing the stream's schema""" + pass diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py new file mode 100644 index 0000000000000..5fcd546f87bd4 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/__init__.py @@ -0,0 +1,12 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer import CartesianProductStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer import DatetimeStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer import ListStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.single_slice import SingleSlice +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.substream_slicer import SubstreamSlicer + +__all__ = ["CartesianProductStreamSlicer", "DatetimeStreamSlicer", "ListStreamSlicer", "SingleSlice", "StreamSlicer", "SubstreamSlicer"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py new file mode 100644 index 0000000000000..de6c745ef3520 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py @@ -0,0 +1,113 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import itertools +from collections import ChainMap +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, List, Mapping, Optional + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class CartesianProductStreamSlicer(StreamSlicer, JsonSchemaMixin): + """ + Stream slicers that iterates over the cartesian product of input stream slicers + Given 2 stream slicers with the following slices: + A: [{"i": 0}, {"i": 1}, {"i": 2}] + B: [{"s": "hello"}, {"s": "world"}] + the resulting stream slices are + [ + {"i": 0, "s": "hello"}, + {"i": 0, "s": "world"}, + {"i": 1, "s": "hello"}, + {"i": 1, "s": "world"}, + {"i": 2, "s": "hello"}, + {"i": 2, "s": "world"}, + ] + + Attributes: + stream_slicers (List[StreamSlicer]): Underlying stream slicers. The RequestOptions (e.g: Request headers, parameters, etc..) returned by this slicer are the combination of the RequestOptions of its input slicers. If there are conflicts e.g: two slicers define the same header or request param, the conflict is resolved by taking the value from the first slicer, where ordering is determined by the order in which slicers were input to this composite slicer. 
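A standalone sketch of the combination the CartesianProductStreamSlicer docstring describes, using the same `itertools.product` plus `ChainMap` composition its `stream_slices` method relies on; the slice values are the docstring's own example data:

```python
import itertools
from collections import ChainMap

slices_a = [{"i": 0}, {"i": 1}, {"i": 2}]
slices_b = [{"s": "hello"}, {"s": "world"}]

# Each resulting slice merges one element of A with one element of B.
combined = [dict(ChainMap(*parts)) for parts in itertools.product(slices_a, slices_b)]
# -> [{"i": 0, "s": "hello"}, {"i": 0, "s": "world"}, {"i": 1, "s": "hello"}, ...]
```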
+ """ + + stream_slicers: List[StreamSlicer] + options: InitVar[Mapping[str, Any]] + + def update_cursor(self, stream_slice: Mapping[str, Any], last_record: Optional[Mapping[str, Any]] = None): + for slicer in self.stream_slicers: + slicer.update_cursor(stream_slice, last_record) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ + s.get_request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ + s.get_request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return dict( + ChainMap( + *[ + s.get_request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + return dict( + ChainMap( + *[ + s.get_request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + for s in self.stream_slicers + ] + ) + ) + + def get_stream_state(self) -> Mapping[str, Any]: + return dict(ChainMap(*[slicer.get_stream_state() for slicer in self.stream_slicers])) + + def stream_slices(self, sync_mode: SyncMode, stream_state: Mapping[str, Any]) -> Iterable[Mapping[str, Any]]: + sub_slices = (s.stream_slices(sync_mode, stream_state) for s in self.stream_slicers) + return (ChainMap(*a) for a in itertools.product(*sub_slices)) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/datetime_stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/datetime_stream_slicer.py new file mode 100644 index 0000000000000..c81d11e851298 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/datetime_stream_slicer.py @@ -0,0 +1,243 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +import datetime +import re +from dataclasses import InitVar, dataclass, field +from typing import Any, Iterable, Mapping, Optional, Union + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class DatetimeStreamSlicer(StreamSlicer, JsonSchemaMixin): + """ + Slices the stream over a datetime range. + + Given a start time, end time, a step function, and an optional lookback window, + the stream slicer will partition the date range from start time - lookback window to end time. + + The step function is defined as a string of the form: + `""` + + where unit can be one of + - weeks, w + - days, d + + For example, "1d" will produce windows of 1 day, and 2weeks windows of 2 weeks. + + The timestamp format accepts the same format codes as datetime.strfptime, which are + all the format codes required by the 1989 C standard. + Full list of accepted format codes: https://man7.org/linux/man-pages/man3/strftime.3.html + + Attributes: + start_datetime (MinMaxDatetime): the datetime that determines the earliest record that should be synced + end_datetime (MinMaxDatetime): the datetime that determines the last record that should be synced + step (str): size of the timewindow + cursor_field (InterpolatedString): record's cursor field + datetime_format (str): format of the datetime + config (Config): connection config + start_time_option (Optional[RequestOption]): request option for start time + end_time_option (Optional[RequestOption]): request option for end time + stream_state_field_start (Optional[str]): stream slice start time field + stream_state_field_end (Optional[str]): stream slice end time field + lookback_window (Optional[InterpolatedString]): how many days before start_datetime to read data for + """ + + start_datetime: MinMaxDatetime + end_datetime: MinMaxDatetime + step: str + cursor_field: InterpolatedString + datetime_format: str + config: Config + options: InitVar[Mapping[str, Any]] + _cursor: dict = field(repr=False, default=None) # tracks current datetime + _cursor_end: dict = field(repr=False, default=None) # tracks end of current stream slice + start_time_option: Optional[RequestOption] = None + end_time_option: Optional[RequestOption] = None + stream_state_field_start: Optional[str] = None + stream_state_field_end: Optional[str] = None + lookback_window: Optional[InterpolatedString] = None + + timedelta_regex = re.compile(r"((?P[\.\d]+?)w)?" 
r"((?P[\.\d]+?)d)?$") + + def __post_init__(self, options: Mapping[str, Any]): + self._timezone = datetime.timezone.utc + self._interpolation = JinjaInterpolation() + + self._step = self._parse_timedelta(self.step) + self.cursor_field = InterpolatedString.create(self.cursor_field, options=options) + self.stream_slice_field_start = InterpolatedString.create(self.stream_state_field_start or "start_time", options=options) + self.stream_slice_field_end = InterpolatedString.create(self.stream_state_field_end or "end_time", options=options) + + # If datetime format is not specified then start/end datetime should inherit it from the stream slicer + if not self.start_datetime.datetime_format: + self.start_datetime.datetime_format = self.datetime_format + if not self.end_datetime.datetime_format: + self.end_datetime.datetime_format = self.datetime_format + + if self.start_time_option and self.start_time_option.inject_into == RequestOptionType.path: + raise ValueError("Start time cannot be passed by path") + if self.end_time_option and self.end_time_option.inject_into == RequestOptionType.path: + raise ValueError("End time cannot be passed by path") + + def get_stream_state(self) -> StreamState: + return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} + + def update_cursor(self, stream_slice: StreamSlice, last_record: Optional[Record] = None): + """ + Update the cursor value to the max datetime between the last record, the start of the stream_slice, and the current cursor value. + Update the cursor_end value with the stream_slice's end time. + + :param stream_slice: current stream slice + :param last_record: last record read + :return: None + """ + stream_slice_value = stream_slice.get(self.cursor_field.eval(self.config)) + stream_slice_value_end = stream_slice.get(self.stream_slice_field_end.eval(self.config)) + last_record_value = last_record.get(self.cursor_field.eval(self.config)) if last_record else None + cursor = None + if stream_slice_value and last_record_value: + cursor = max(stream_slice_value, last_record_value) + elif stream_slice_value: + cursor = stream_slice_value + else: + cursor = last_record_value + if self._cursor and cursor: + self._cursor = max(cursor, self._cursor) + elif cursor: + self._cursor = cursor + if self.stream_slice_field_end: + self._cursor_end = stream_slice_value_end + + def stream_slices(self, sync_mode: SyncMode, stream_state: Mapping[str, Any]) -> Iterable[Mapping[str, Any]]: + """ + Partition the daterange into slices of size = step. + + The start of the window is the minimum datetime between start_datetime - looback_window and the stream_state's datetime + The end of the window is the minimum datetime between the start of the window and end_datetime. + + :param sync_mode: + :param stream_state: current stream state. If set, the start_date will be the day following the stream_state. 
+ :return: + """ + stream_state = stream_state or {} + kwargs = {"stream_state": stream_state} + end_datetime = min(self.end_datetime.get_datetime(self.config, **kwargs), datetime.datetime.now(tz=self._timezone)) + lookback_delta = self._parse_timedelta(self.lookback_window.eval(self.config, **kwargs) if self.lookback_window else "0d") + start_datetime = self.start_datetime.get_datetime(self.config, **kwargs) - lookback_delta + start_datetime = min(start_datetime, end_datetime) + if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state: + cursor_datetime = self.parse_date(stream_state[self.cursor_field.eval(self.config)]) + else: + cursor_datetime = start_datetime + + start_datetime = max(cursor_datetime, start_datetime) + + state_date = self.parse_date(stream_state.get(self.cursor_field.eval(self.config, stream_state=stream_state))) + if state_date: + # If the input_state's date is greater than start_datetime, the start of the time window is the state's next day + next_date = state_date + datetime.timedelta(days=1) + start_datetime = max(start_datetime, next_date) + dates = self._partition_daterange(start_datetime, end_datetime, self._step) + return dates + + def _format_datetime(self, dt: datetime.datetime): + # strftime("%s") is unreliable because it ignores the time zone information and assumes the time zone of the system it's running on + # It's safer to use the timestamp() method than the %s directive + # See https://stackoverflow.com/a/4974930 + if self.datetime_format == "%s": + return str(int(dt.timestamp())) + else: + return dt.strftime(self.datetime_format) + + def _partition_daterange(self, start, end, step: datetime.timedelta): + start_field = self.stream_slice_field_start.eval(self.config) + end_field = self.stream_slice_field_end.eval(self.config) + dates = [] + while start <= end: + end_date = self._get_date(start + step - datetime.timedelta(days=1), end, min) + dates.append({start_field: self._format_datetime(start), end_field: self._format_datetime(end_date)}) + start += step + return dates + + def _get_date(self, cursor_value, default_date: datetime.datetime, comparator) -> datetime.datetime: + cursor_date = self.parse_date(cursor_value or default_date) + return comparator(cursor_date, default_date) + + def parse_date(self, date: Union[str, datetime.datetime]) -> datetime.datetime: + if isinstance(date, str): + return datetime.datetime.strptime(str(date), self.datetime_format).replace(tzinfo=self._timezone) + else: + return date + + @classmethod + def _parse_timedelta(cls, time_str): + """ + Parse a time string e.g. (2h13m) into a timedelta object. + Modified from virhilo's answer at https://stackoverflow.com/a/4628148/851699 + :param time_str: A string identifying a duration. (eg. 
2h13m) + :return datetime.timedelta: A datetime.timedelta object + """ + parts = cls.timedelta_regex.match(time_str) + + assert parts is not None + + time_params = {name: float(param) for name, param in parts.groupdict().items() if param} + return datetime.timedelta(**time_params) + + def get_request_params( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.request_parameter, stream_slice) + + def get_request_headers( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.header, stream_slice) + + def get_request_body_data( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_data, stream_slice) + + def get_request_body_json( + self, + *, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_options(RequestOptionType.body_json, stream_slice) + + def request_kwargs(self) -> Mapping[str, Any]: + # Never update kwargs + return {} + + def _get_request_options(self, option_type: RequestOptionType, stream_slice: StreamSlice): + options = {} + if self.start_time_option and self.start_time_option.inject_into == option_type: + options[self.start_time_option.field_name] = stream_slice.get(self.stream_slice_field_start.eval(self.config)) + if self.end_time_option and self.end_time_option.inject_into == option_type: + options[self.end_time_option.field_name] = stream_slice.get(self.stream_slice_field_end.eval(self.config)) + return options diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/list_stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/list_stream_slicer.py new file mode 100644 index 0000000000000..ac83d1a967cf9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/list_stream_slicer.py @@ -0,0 +1,92 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
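# Standalone sketch of the windowing behaviour implemented by _partition_daterange above;
# "start_time"/"end_time" are the slicer's default slice field names, and the dates are hypothetical.
import datetime

def partition(start: datetime.datetime, end: datetime.datetime, step: datetime.timedelta):
    slices = []
    while start <= end:
        # each window ends one day before the next one starts, capped at the overall end date
        window_end = min(start + step - datetime.timedelta(days=1), end)
        slices.append({"start_time": start.strftime("%Y-%m-%d"), "end_time": window_end.strftime("%Y-%m-%d")})
        start += step
    return slices

# A "1d" step over three days yields one slice per day:
# [{'start_time': '2022-01-01', 'end_time': '2022-01-01'}, ..., {'start_time': '2022-01-03', 'end_time': '2022-01-03'}]
print(partition(datetime.datetime(2022, 1, 1), datetime.datetime(2022, 1, 3), datetime.timedelta(days=1)))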
+# + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, List, Mapping, Optional, Union + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class ListStreamSlicer(StreamSlicer, JsonSchemaMixin): + """ + Stream slicer that iterates over the values of a list + If slice_values is a string, then evaluate it as literal and assert the resulting literal is a list + + Attributes: + slice_values (Union[str, List[str]]): The values to iterate over + cursor_field (Union[InterpolatedString, str]): The name of the cursor field + config (Config): The user-provided configuration as specified by the source's spec + request_option (Optional[RequestOption]): The request option to configure the HTTP request + """ + + slice_values: Union[str, List[str]] + cursor_field: Union[InterpolatedString, str] + config: Config + options: InitVar[Mapping[str, Any]] + request_option: Optional[RequestOption] = None + + def __post_init__(self, options: Mapping[str, Any]): + if isinstance(self.slice_values, str): + self.slice_values = InterpolatedString.create(self.slice_values, options=options).eval(self.config) + if isinstance(self.cursor_field, str): + self.cursor_field = InterpolatedString(string=self.cursor_field, options=options) + + if self.request_option and self.request_option.inject_into == RequestOptionType.path: + raise ValueError("Slice value cannot be injected in the path") + self._cursor = None + + def update_cursor(self, stream_slice: StreamSlice, last_record: Optional[Record] = None): + slice_value = stream_slice.get(self.cursor_field.eval(self.config)) + if slice_value and slice_value in self.slice_values: + self._cursor = slice_value + + def get_stream_state(self) -> StreamState: + return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.request_parameter) + + def get_request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.header) + + def get_request_body_data( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.body_data) + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.body_json) + + def stream_slices(self, sync_mode: SyncMode, stream_state: Mapping[str, Any]) -> Iterable[Mapping[str, Any]]: + return [{self.cursor_field.eval(self.config): slice_value} for slice_value in 
self.slice_values] + + def _get_request_option(self, request_option_type: RequestOptionType): + if self.request_option and self.request_option.inject_into == request_option_type: + return {self.request_option.field_name: self._cursor} + else: + return {} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/single_slice.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/single_slice.py new file mode 100644 index 0000000000000..532982de9d088 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/single_slice.py @@ -0,0 +1,59 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, Mapping, Optional + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class SingleSlice(StreamSlicer, JsonSchemaMixin): + """Stream slicer returning only a single stream slice""" + + options: InitVar[Mapping[str, Any]] + + def update_cursor(self, stream_slice: StreamSlice, last_record: Optional[Record] = None): + pass + + def get_stream_state(self) -> StreamState: + return {} + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_data( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return {} + + def stream_slices(self, sync_mode: SyncMode, stream_state: Mapping[str, Any]) -> Iterable[StreamSlice]: + return [dict()] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py new file mode 100644 index 0000000000000..4ff22ce12c611 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py @@ -0,0 +1,46 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from dataclasses import dataclass +from typing import Iterable, Optional + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider import RequestOptionsProvider +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState + + +@dataclass +class StreamSlicer(RequestOptionsProvider): + """ + Slices the stream into a subset of records. + Slices enable state checkpointing and data retrieval parallelization. + + The stream slicer keeps track of the cursor state as a dict of cursor_field -> cursor_value + + See the stream slicing section of the docs for more information. 
+ """ + + @abstractmethod + def stream_slices(self, sync_mode: SyncMode, stream_state: StreamState) -> Iterable[StreamSlice]: + """ + Defines stream slices + + :param sync_mode: The sync mode used the read data + :param stream_state: The current stream state + :return: List of stream slices + """ + + @abstractmethod + def update_cursor(self, stream_slice: StreamSlice, last_record: Optional[Record] = None): + """ + State setter, accept state serialized by state getter. + + :param stream_slice: Current stream_slice + :param last_record: Last record read from the source + """ + + @abstractmethod + def get_stream_state(self) -> StreamState: + """Returns the current stream state""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/substream_slicer.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/substream_slicer.py new file mode 100644 index 0000000000000..d5b8b306b86dd --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/stream_slicers/substream_slicer.py @@ -0,0 +1,141 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, Iterable, List, Mapping, Optional + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer +from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState +from airbyte_cdk.sources.streams.core import Stream +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class ParentStreamConfig: + """ + Describes how to create a stream slice from a parent stream + + stream: The stream to read records from + parent_key: The key of the parent stream's records that will be the stream slice key + stream_slice_field: The stream slice key + request_option: How to inject the slice value on an outgoing HTTP request + """ + + stream: Stream + parent_key: str + stream_slice_field: str + options: InitVar[Mapping[str, Any]] + request_option: Optional[RequestOption] = None + + +@dataclass +class SubstreamSlicer(StreamSlicer, JsonSchemaMixin): + """ + Stream slicer that iterates over the parent's stream slices and records and emits slices by interpolating the slice_definition mapping + Will populate the state with `parent_stream_slice` and `parent_record` so they can be accessed by other components + + Attributes: + parent_stream_configs (List[ParentStreamConfig]): parent streams to iterate over and their config + """ + + parent_stream_configs: List[ParentStreamConfig] + options: InitVar[Mapping[str, Any]] + + def __post_init__(self, options: Mapping[str, Any]): + if not self.parent_stream_configs: + raise ValueError("SubstreamSlicer needs at least 1 parent stream") + self._cursor = None + self._options = options + + def update_cursor(self, stream_slice: StreamSlice, last_record: Optional[Record] = None): + cursor = {} + for parent_stream_config in self.parent_stream_configs: + slice_value = stream_slice.get(parent_stream_config.stream_slice_field) + if slice_value: + cursor.update({parent_stream_config.stream_slice_field: slice_value}) + self._cursor = cursor + + def get_request_params( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.request_parameter) + + 
def get_request_headers( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.header) + + def get_request_body_data( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Mapping[str, Any]: + return self._get_request_option(RequestOptionType.body_data) + + def get_request_body_json( + self, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + next_page_token: Optional[Mapping[str, Any]] = None, + ) -> Optional[Mapping]: + return self._get_request_option(RequestOptionType.body_json) + + def _get_request_option(self, option_type: RequestOptionType): + params = {} + for parent_config in self.parent_stream_configs: + if parent_config.request_option and parent_config.request_option.inject_into == option_type: + key = parent_config.stream_slice_field + value = self._cursor.get(key) + if value: + params.update({key: value}) + return params + + def get_stream_state(self) -> StreamState: + return self._cursor if self._cursor else {} + + def stream_slices(self, sync_mode: SyncMode, stream_state: StreamState) -> Iterable[StreamSlice]: + """ + Iterate over each parent stream's record and create a StreamSlice for each record. + + For each stream, iterate over its stream_slices. + For each stream slice, iterate over each record. + yield a stream slice for each such records. + + If a parent slice contains no record, emit a slice with parent_record=None. + + The template string can interpolate the following values: + - parent_stream_slice: mapping representing the parent's stream slice + - parent_record: mapping representing the parent record + - parent_stream_name: string representing the parent stream name + """ + if not self.parent_stream_configs: + yield from [] + else: + for parent_stream_config in self.parent_stream_configs: + parent_stream = parent_stream_config.stream + parent_field = parent_stream_config.parent_key + stream_state_field = parent_stream_config.stream_slice_field + for parent_stream_slice in parent_stream.stream_slices(sync_mode=sync_mode, cursor_field=None, stream_state=stream_state): + empty_parent_slice = True + parent_slice = parent_stream_slice.get("slice") + + for parent_record in parent_stream.read_records( + sync_mode=SyncMode.full_refresh, cursor_field=None, stream_slice=parent_stream_slice, stream_state=None + ): + empty_parent_slice = False + stream_state_value = parent_record.get(parent_field) + yield {stream_state_field: stream_state_value, "parent_slice": parent_slice} + # If the parent slice contains no records, + if empty_parent_slice: + stream_state_value = parent_stream_slice.get(parent_field) + yield {stream_state_field: stream_state_value, "parent_slice": parent_slice} diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py new file mode 100644 index 0000000000000..1963aeaf0fd53 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/__init__.py @@ -0,0 +1,17 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + + +# RecordTransformation is depended upon by every class in this module (since it's the abc everything implements). 
For this reason, +# the order of imports matters i.e: this file must fully import RecordTransformation before importing anything which depends on RecordTransformation +# Otherwise there will be a circular dependency (load order will be init.py --> RemoveFields (which tries to import RecordTransformation) --> +# init.py --> circular dep error, since loading this file causes it to try to import itself down the line. +# so we add the split directive below to tell isort to sort imports while keeping RecordTransformation as the first import +from .transformation import RecordTransformation + +# isort: split +from .add_fields import AddFields +from .remove_fields import RemoveFields + +__all__ = ["AddFields", "RecordTransformation", "RemoveFields"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py new file mode 100644 index 0000000000000..51ed5468acbd3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/add_fields.py @@ -0,0 +1,121 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass, field +from typing import Any, List, Mapping, Optional, Union + +import dpath.util +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.declarative.types import Config, FieldPointer, Record, StreamSlice, StreamState +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass(frozen=True) +class AddedFieldDefinition: + """Defines the field to add on a record""" + + path: FieldPointer + value: Union[InterpolatedString, str] + options: InitVar[Mapping[str, Any]] + + +@dataclass(frozen=True) +class ParsedAddFieldDefinition: + """Defines the field to add on a record""" + + path: FieldPointer + value: InterpolatedString + options: InitVar[Mapping[str, Any]] + + +@dataclass +class AddFields(RecordTransformation, JsonSchemaMixin): + """ + Transformation which adds field to an output record. The path of the added field can be nested. Adding nested fields will create all + necessary parent objects (like mkdir -p). Adding fields to an array will extend the array to that index (filling intermediate + indices with null values). So if you add a field at index 5 to the array ["value"], it will become ["value", null, null, null, null, + "new_value"]. 
+ + + This transformation has access to the following contextual values: + record: the record about to be output by the connector + config: the input configuration provided to a connector + stream_state: the current state of the stream + stream_slice: the current stream slice being read + + + + Examples of instantiating this transformation via YAML: + - type: AddFields + fields: + # hardcoded constant + - path: ["path"] + value: "static_value" + + # nested path + - path: ["path", "to", "field"] + value: "static" + + # from config + - path: ["shop_id"] + value: "{{ config.shop_id }}" + + # from state + - path: ["current_state"] + value: "{{ stream_state.cursor_field }}" # Or {{ stream_state['cursor_field'] }} + + # from record + - path: ["unnested_value"] + value: {{ record.nested.field }} + + # from stream_slice + - path: ["start_date"] + value: {{ stream_slice.start_date }} + + # by supplying any valid Jinja template directive or expression https://jinja.palletsprojects.com/en/3.1.x/templates/# + - path: ["two_times_two"] + value: {{ 2 * 2 }} + + Attributes: + fields (List[AddedFieldDefinition]): A list of transformations (path and corresponding value) that will be added to the record + """ + + fields: List[AddedFieldDefinition] + options: InitVar[Mapping[str, Any]] + _parsed_fields: List[ParsedAddFieldDefinition] = field(init=False, repr=False, default_factory=list) + + def __post_init__(self, options: Mapping[str, Any]): + for add_field in self.fields: + if len(add_field.path) < 1: + raise f"Expected a non-zero-length path for the AddFields transformation {add_field}" + + if not isinstance(add_field.value, InterpolatedString): + if not isinstance(add_field.value, str): + raise f"Expected a string value for the AddFields transformation: {add_field}" + else: + self._parsed_fields.append( + ParsedAddFieldDefinition( + add_field.path, InterpolatedString.create(add_field.value, options=options), options=options + ) + ) + else: + self._parsed_fields.append(ParsedAddFieldDefinition(add_field.path, add_field.value, options={})) + + def transform( + self, + record: Record, + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> Record: + kwargs = {"record": record, "stream_state": stream_state, "stream_slice": stream_slice} + for parsed_field in self._parsed_fields: + value = parsed_field.value.eval(config, **kwargs) + dpath.util.new(record, parsed_field.path, value) + + return record + + def __eq__(self, other): + return self.__dict__ == other.__dict__ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py new file mode 100644 index 0000000000000..7c568a45941ed --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/remove_fields.py @@ -0,0 +1,58 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from dataclasses import InitVar, dataclass +from typing import Any, List, Mapping + +import dpath.exceptions +import dpath.util +from airbyte_cdk.sources.declarative.transformations import RecordTransformation +from airbyte_cdk.sources.declarative.types import FieldPointer, Record +from dataclasses_jsonschema import JsonSchemaMixin + + +@dataclass +class RemoveFields(RecordTransformation, JsonSchemaMixin): + """ + A transformation which removes fields from a record. The fields removed are designated using FieldPointers. 
+ During transformation, if a field or any of its parents does not exist in the record, no error is thrown. + + If an input field pointer references an item in a list (e.g: ["k", 0] in the object {"k": ["a", "b", "c"]}) then + the object at that index is set to None rather than being not entirely removed from the list. TODO change this behavior. + + It's possible to remove objects nested in lists e.g: removing [".", 0, "k"] from {".": [{"k": "V"}]} results in {".": [{}]} + + Usage syntax: + + ```yaml + my_stream: + + transformations: + - type: RemoveFields + field_pointers: + - ["path", "to", "field1"] + - ["path2"] + ``` + + Attributes: + field_pointers (List[FieldPointer]): pointers to the fields that should be removed + """ + + field_pointers: List[FieldPointer] + options: InitVar[Mapping[str, Any]] + + def transform(self, record: Record, **kwargs) -> Record: + """ + :param record: The record to be transformed + :return: the input record with the requested fields removed + """ + for pointer in self.field_pointers: + # the dpath library by default doesn't delete fields from arrays + try: + dpath.util.delete(record, pointer) + except dpath.exceptions.PathNotFound: + # if the (potentially nested) property does not exist, silently skip + pass + + return record diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py new file mode 100644 index 0000000000000..1b2c429687d0a --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/transformations/transformation.py @@ -0,0 +1,37 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional + +from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState + + +@dataclass +class RecordTransformation(ABC): + """ + Implementations of this class define transformations that can be applied to records of a stream. + """ + + @abstractmethod + def transform( + self, + record: Record, + config: Optional[Config] = None, + stream_state: Optional[StreamState] = None, + stream_slice: Optional[StreamSlice] = None, + ) -> Record: + """ + Transform a record by adding, deleting, or mutating fields. + + :param record: The input record to be transformed + :param config: The user-provided configuration as specified by the source's spec + :param stream_state: The stream state + :param stream_slice: The stream slice + :return: The transformed record + """ + + def __eq__(self, other): + return other.__dict__ == self.__dict__ diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py new file mode 100644 index 0000000000000..c69405ba3eaa0 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/types.py @@ -0,0 +1,16 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from __future__ import annotations + +from typing import Any, List, Mapping + +Record = Mapping[str, Any] +# A FieldPointer designates a path to a field inside a mapping. 
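# A short sketch combining the AddFields and RemoveFields transformations defined above;
# the record, config value, and field paths are hypothetical.
from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields
from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition

record = {"id": 1, "internal": {"token": "secret"}}
add_shop_id = AddFields(fields=[AddedFieldDefinition(path=["shop_id"], value="{{ config['shop_id'] }}", options={})], options={})
drop_token = RemoveFields(field_pointers=[["internal", "token"]], options={})

record = add_shop_id.transform(record, config={"shop_id": 1234})
record = drop_token.transform(record)
# record now carries "shop_id" interpolated from the config and no longer exposes internal.token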
For example, retrieving ["k1", "k1.2"] in the object {"k1" :{"k1.2": +# "hello"}] returns "hello" +FieldPointer = List[str] +Config = Mapping[str, Any] +ConnectionDefinition = Mapping[str, Any] +StreamSlice = Mapping[str, Any] +StreamState = Mapping[str, Any] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py b/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py new file mode 100644 index 0000000000000..bebecdfa2e2a3 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/declarative/yaml_declarative_source.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import json +import logging +from typing import Any, List, Mapping + +from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker +from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource +from airbyte_cdk.sources.declarative.parsers.factory import DeclarativeComponentFactory +from airbyte_cdk.sources.declarative.parsers.yaml_parser import YamlParser +from airbyte_cdk.sources.streams import Stream + + +class YamlDeclarativeSource(DeclarativeSource): + """Declarative source defined by a yaml file""" + + def __init__(self, path_to_yaml): + """ + :param path_to_yaml: Path to the yaml file describing the source + """ + self.logger = logging.getLogger(f"airbyte.{self.name}") + self._factory = DeclarativeComponentFactory() + self._path_to_yaml = path_to_yaml + self._source_config = self._read_and_parse_yaml_file(path_to_yaml) + + @property + def connection_checker(self) -> ConnectionChecker: + check = self._source_config["check"] + if "class_name" not in check: + check["class_name"] = "airbyte_cdk.sources.declarative.checks.check_stream.CheckStream" + return self._factory.create_component(check, dict())(source=self) + + def streams(self, config: Mapping[str, Any]) -> List[Stream]: + self.logger.debug( + "parsed YAML into declarative source", + extra={"path_to_yaml_file": self._path_to_yaml, "source_name": self.name, "parsed_config": json.dumps(self._source_config)}, + ) + + stream_configs = self._source_config["streams"] + for s in stream_configs: + if "class_name" not in s: + s["class_name"] = "airbyte_cdk.sources.declarative.declarative_stream.DeclarativeStream" + return [self._factory.create_component(stream_config, config)() for stream_config in self._source_config["streams"]] + + def _read_and_parse_yaml_file(self, path_to_yaml_file): + with open(path_to_yaml_file, "r") as f: + config_content = f.read() + return YamlParser().parse(config_content) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/deprecated/base_source.py b/airbyte-cdk/python/airbyte_cdk/sources/deprecated/base_source.py index c1dd61c45d114..5196ee7724b44 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/deprecated/base_source.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/deprecated/base_source.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/airbyte_cdk/sources/deprecated/client.py b/airbyte-cdk/python/airbyte_cdk/sources/deprecated/client.py index 070648e94ca5d..0be3ea44e1603 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/deprecated/client.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/deprecated/client.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
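# Sketch of how a connector might wire a YAML manifest into YamlDeclarativeSource; the file name
# and the use of the CDK's launch() entrypoint are assumptions, not part of this change.
import sys

from airbyte_cdk.entrypoint import launch
from airbyte_cdk.sources.declarative.yaml_declarative_source import YamlDeclarativeSource

class SourceExampleApi(YamlDeclarativeSource):
    def __init__(self):
        # "example_api.yaml" is a hypothetical manifest containing "streams" and "check" sections
        super().__init__(path_to_yaml="example_api.yaml")

if __name__ == "__main__":
    launch(SourceExampleApi(), sys.argv[1:])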
# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/singer/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/singer/__init__.py index b0f243a9be77c..6c76280f33c27 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/singer/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/singer/__init__.py @@ -3,6 +3,6 @@ # from .singer_helpers import SingerHelper, SyncModeInfo -from .source import SingerSource +from .source import ConfigContainer, SingerSource -__all__ = ["SingerSource", "SyncModeInfo", "SingerHelper"] +__all__ = ["ConfigContainer", "SingerSource", "SyncModeInfo", "SingerHelper"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/singer/singer_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/singer/singer_helpers.py index d279e3d7d3004..fdfe9deb5d54b 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/singer/singer_helpers.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/singer/singer_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -47,7 +47,7 @@ def configured_for_incremental(configured_stream: ConfiguredAirbyteStream): return configured_stream.sync_mode and configured_stream.sync_mode == SyncMode.incremental -def get_stream_level_metadata(metadatas: List[Dict[str, any]]) -> Optional[Dict[str, any]]: +def get_stream_level_metadata(metadatas: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]: for metadata in metadatas: if not is_field_metadata(metadata) and "metadata" in metadata: return metadata.get("metadata") @@ -67,7 +67,7 @@ class SyncModeInfo: default_cursor_field: Optional[List[str]] = None -def set_sync_modes_from_metadata(airbyte_stream: AirbyteStream, metadatas: List[Dict[str, any]]): +def set_sync_modes_from_metadata(airbyte_stream: AirbyteStream, metadatas: List[Dict[str, Any]]): stream_metadata = get_stream_level_metadata(metadatas) if stream_metadata: # A stream is incremental if it declares replication keys or if forced-replication-method is set to incremental @@ -102,11 +102,14 @@ class SingerHelper: def _transform_types(stream_properties: DefaultDict): for field_name in stream_properties: field_object = stream_properties[field_name] - field_object["type"] = SingerHelper._parse_type(field_object["type"]) + # according to issue CDK: typing errors #9500, mypy raises error on this line + # '"Type[SingerHelper]" has no attribute "_parse_type"', it's need to fix + # ignored for now + field_object["type"] = SingerHelper._parse_type(field_object["type"]) # type: ignore @staticmethod def singer_catalog_to_airbyte_catalog( - singer_catalog: Dict[str, any], sync_mode_overrides: Dict[str, SyncModeInfo], primary_key_overrides: Dict[str, List[str]] + singer_catalog: Dict[str, Any], sync_mode_overrides: Dict[str, SyncModeInfo], primary_key_overrides: Dict[str, List[str]] ) -> AirbyteCatalog: """ :param singer_catalog: @@ -116,7 +119,11 @@ def singer_catalog_to_airbyte_catalog( :return: Airbyte Catalog """ airbyte_streams = [] - for stream in singer_catalog.get("streams"): + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'Item "None" of "Optional[Any]" has no attribute "__iter__" (not iterable)' + # It occurs because default value isn't set, and it's None + # It's needed to set default value, ignored for now + for stream in singer_catalog.get("streams"): # type: ignore name = stream.get("stream") schema = stream.get("schema") airbyte_stream = AirbyteStream(name=name, json_schema=schema) @@ -154,9 +161,17 @@ def get_catalogs( 
singer_catalog = SingerHelper._read_singer_catalog(logger, shell_command) streams = singer_catalog.get("streams", []) if streams and excluded_streams: - singer_catalog["streams"] = [stream for stream in streams if stream["stream"] not in excluded_streams] - - airbyte_catalog = SingerHelper.singer_catalog_to_airbyte_catalog(singer_catalog, sync_mode_overrides, primary_key_overrides) + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'Unsupported target for indexed assignment ("Mapping[str, Any]")' + # _read_singer_catalog returns Mapping, to fix this error it should be changed to MutableMapping + # ignored for now + singer_catalog["streams"] = [stream for stream in streams if stream["stream"] not in excluded_streams] # type: ignore + + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'Argument 1 to "singer_catalog_to_airbyte_catalog" of "SingerHelper" has incompatible type "Mapping[str, Any]"; expected "Dict[str, Any]"' + # singer_catalog is Mapping, because _read_singer_catalog returns Mapping, but singer_catalog_to_airbyte_catalog expects Dict + # it's needed to check and fix, ignored for now + airbyte_catalog = SingerHelper.singer_catalog_to_airbyte_catalog(singer_catalog, sync_mode_overrides, primary_key_overrides) # type: ignore return Catalogs(singer_catalog=singer_catalog, airbyte_catalog=airbyte_catalog) @staticmethod @@ -177,14 +192,22 @@ def read(logger, shell_command, is_message=(lambda x: True)) -> Iterator[Airbyte @staticmethod def _read_lines(process: subprocess.Popen) -> Iterator[Tuple[str, TextIOWrapper]]: sel = selectors.DefaultSelector() - sel.register(process.stdout, selectors.EVENT_READ) - sel.register(process.stderr, selectors.EVENT_READ) + # according to issue CDK: typing errors #9500, mypy raises error on this two lines + # 'Argument 1 to "register" of "DefaultSelector" has incompatible type "Optional[IO[Any]]"; expected "Union[int, HasFileno]"' + # 'Argument 1 to "register" of "DefaultSelector" has incompatible type "Optional[IO[Any]]"; expected "Union[int, HasFileno]"' + # It's need to check, ignored for now + sel.register(process.stdout, selectors.EVENT_READ) # type: ignore + sel.register(process.stderr, selectors.EVENT_READ) # type: ignore eof = False while not eof: selects_list = sel.select() empty_line_counter = 0 for key, _ in selects_list: - line = key.fileobj.readline() + # according to issue CDK: typing errors #9500, mypy raises two errors on these lines + # 'Item "int" of "Union[int, HasFileno]" has no attribute "readline"' + # 'Item "HasFileno" of "Union[int, HasFileno]" has no attribute "readline"' + # It's need to check, ignored for now + line = key.fileobj.readline() # type: ignore if not line: empty_line_counter += 1 if empty_line_counter >= len(selects_list): @@ -193,12 +216,21 @@ def _read_lines(process: subprocess.Popen) -> Iterator[Tuple[str, TextIOWrapper] try: process.wait(timeout=60) except subprocess.TimeoutExpired: - raise Exception(f"Underlying command {process.args} is hanging") + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'On Python 3 '{}'.format(b'abc') produces "b'abc'", not 'abc'; use '{!r}'.format(b'abc') if this is desired behavior' + # It's need to fix, ignored for now + raise Exception(f"Underlying command {process.args} is hanging") # type: ignore if process.returncode != 0: - raise Exception(f"Underlying command {process.args} failed with exit code {process.returncode}") + # according to issue CDK: typing errors #9500, mypy 
raises error on this line + # 'On Python 3 '{}'.format(b'abc') produces "b'abc'", not 'abc'; use '{!r}'.format(b'abc') if this is desired behavior' + # It's need to fix, ignored for now + raise Exception(f"Underlying command {process.args} failed with exit code {process.returncode}") # type: ignore else: - yield line, key.fileobj + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'Incompatible types in "yield" (actual type "Tuple[Any, Union[int, HasFileno]]", expected type "Tuple[str, TextIOWrapper]")' + # It's need to fix, ignored for now + yield line, key.fileobj # type: ignore @staticmethod def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[AirbyteMessage]: @@ -210,7 +242,12 @@ def _airbyte_message_from_json(transformed_json: Mapping[str, Any]) -> Optional[ else: # todo: check that messages match the discovered schema stream_name = transformed_json["stream"] - out_record = AirbyteRecordMessage( + # according to issue CDK: typing errors #9500, mypy raises error on this line + # 'Incompatible types in assignment (expression has type "AirbyteRecordMessage", variable has type "AirbyteStateMessage")' + # type of out_record is first initialized as AirbyteStateMessage on the line 240 + # however AirbyteRecordMessage is assigned on the line below, it causes error + # ignored + out_record = AirbyteRecordMessage( # type: ignore stream=stream_name, data=transformed_json["record"], emitted_at=int(datetime.now().timestamp()) * 1000, @@ -227,7 +264,11 @@ def create_singer_catalog_with_selection(masked_airbyte_catalog: ConfiguredAirby configured_stream.stream.name: configured_stream for configured_stream in masked_airbyte_catalog.streams } - for singer_stream in discovered_singer_catalog.get("streams"): + # according to issue CDK: typing errors #9500, mypy raises error on this line + # '"object" has no attribute "get"' + # discovered_singer_catalog type is set to object on the line 259, need to check + # ignored for now + for singer_stream in discovered_singer_catalog.get("streams"): # type: ignore stream_name = singer_stream.get("stream") if stream_name in stream_name_to_configured_stream: new_metadatas = [] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/singer/source.py b/airbyte-cdk/python/airbyte_cdk/sources/singer/source.py index b33071606c3a4..d70beaa2a43fb 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/singer/source.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/singer/source.py @@ -1,76 +1,72 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# -import json +import logging import os -from dataclasses import dataclass -from typing import Dict, Iterable, List, Type +from typing import Any, Dict, Iterable, List, Mapping, Type -from airbyte_cdk.logger import AirbyteLogger from airbyte_cdk.models import AirbyteCatalog, AirbyteConnectionStatus, AirbyteMessage, ConfiguredAirbyteCatalog, Status -from airbyte_cdk.sources.source import Source +from airbyte_cdk.sources.source import BaseSource from airbyte_cdk.sources.utils.catalog_helpers import CatalogHelper from .singer_helpers import Catalogs, SingerHelper, SyncModeInfo -@dataclass -class ConfigContainer: - config: json +class ConfigContainer(Dict[str, Any]): config_path: str + def __init__(self, config, config_path): + super().__init__(config) + self.config_path = config_path -class SingerSource(Source): - # can be overridden to change an input config - def configure(self, raw_config: json, temp_dir: str) -> json: +class SingerSource(BaseSource[ConfigContainer, str, str]): + def configure(self, config: Mapping[str, Any], temp_dir: str) -> ConfigContainer: """ Persist raw_config in temporary directory to run the Source job This can be overridden if extra temporary files need to be persisted in the temp dir """ - config = self.transform_config(raw_config) config_path = os.path.join(temp_dir, "config.json") + config = ConfigContainer(self.transform_config(config), config_path) self.write_config(config, config_path) - return ConfigContainer(config, config_path) + return config # Can be overridden to change an input config - def transform_config(self, config: json) -> json: + def transform_config(self, config: Mapping[str, Any]) -> Mapping[str, Any]: """ Singer source may need to adapt the Config object for the singer tap specifics """ return config - # Overriding to change an input catalog as path instead def read_catalog(self, catalog_path: str) -> str: """ Since singer source don't need actual catalog object, we override this to return path only """ return catalog_path - # Overriding to change an input state as path instead def read_state(self, state_path: str) -> str: """ Since singer source don't need actual state object, we override this to return path only """ return state_path - def check_config(self, logger: AirbyteLogger, config_path: str, config: json) -> AirbyteConnectionStatus: + def check_config(self, logger: logging.Logger, config_path: str, config: ConfigContainer) -> AirbyteConnectionStatus: """ Some Singer source may perform check using config_path or config to tests if the input configuration can be used to successfully connect to the integration """ raise NotImplementedError - def discover_cmd(self, logger: AirbyteLogger, config_path: str) -> str: + def discover_cmd(self, logger: logging.Logger, config_path: str) -> str: """ Returns the command used to run discovery in the singer tap. For example, if the bash command used to invoke the singer tap is `tap-postgres`, and the config JSON lived in "/path/config.json", this method would return "tap-postgres --config /path/config.json" """ raise NotImplementedError - def read_cmd(self, logger: AirbyteLogger, config_path: str, catalog_path: str, state_path: str = None) -> str: + def read_cmd(self, logger: logging.Logger, config_path: str, catalog_path: str, state_path: str = None) -> str: """ Returns the command used to read data from the singer tap. 
For example, if the bash command used to invoke the singer tap is `tap-postgres`, and the config JSON lived in "/path/config.json", and the catalog was in "/path/catalog.json", @@ -78,39 +74,34 @@ def read_cmd(self, logger: AirbyteLogger, config_path: str, catalog_path: str, s """ raise NotImplementedError - def _discover_internal(self, logger: AirbyteLogger, config_path: str) -> Catalogs: + def _discover_internal(self, logger: logging.Logger, config_path: str) -> Catalogs: cmd = self.discover_cmd(logger, config_path) catalogs = SingerHelper.get_catalogs( logger, cmd, self.get_sync_mode_overrides(), self.get_primary_key_overrides(), self.get_excluded_streams() ) return catalogs - def check(self, logger: AirbyteLogger, config_container: ConfigContainer) -> AirbyteConnectionStatus: + def check(self, logger: logging.Logger, config: ConfigContainer) -> AirbyteConnectionStatus: """ Tests if the input configuration can be used to successfully connect to the integration """ - return self.check_config(logger, config_container.config_path, config_container.config) + return self.check_config(logger, config.config_path, config) - def discover(self, logger: AirbyteLogger, config_container) -> AirbyteCatalog: + def discover(self, logger: logging.Logger, config: ConfigContainer) -> AirbyteCatalog: """ Implements the parent class discover method. """ - if isinstance(config_container, ConfigContainer): - return self._discover_internal(logger, config_container.config_path).airbyte_catalog - else: - return self._discover_internal(logger, config_container).airbyte_catalog + return self._discover_internal(logger, config.config_path).airbyte_catalog - def read( - self, logger: AirbyteLogger, config_container: ConfigContainer, catalog_path: str, state_path: str = None - ) -> Iterable[AirbyteMessage]: + def read(self, logger: logging.Logger, config: ConfigContainer, catalog_path: str, state_path: str = None) -> Iterable[AirbyteMessage]: """ Implements the parent class read method. 
""" - catalogs = self._discover_internal(logger, config_container.config_path) + catalogs = self._discover_internal(logger, config.config_path) masked_airbyte_catalog = ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path)) selected_singer_catalog_path = SingerHelper.create_singer_catalog_with_selection(masked_airbyte_catalog, catalogs.singer_catalog) - read_cmd = self.read_cmd(logger, config_container.config_path, selected_singer_catalog_path, state_path) + read_cmd = self.read_cmd(logger, config.config_path, selected_singer_catalog_path, state_path) return SingerHelper.read(logger, read_cmd) def get_sync_mode_overrides(self) -> Dict[str, SyncModeInfo]: @@ -149,7 +140,7 @@ def get_excluded_streams(self) -> List[str]: class BaseSingerSource(SingerSource): force_full_refresh = False - def check_config(self, logger: AirbyteLogger, config_path: str, config: json) -> AirbyteConnectionStatus: + def check_config(self, logger: logging.Logger, config_path: str, config: Mapping[str, Any]) -> AirbyteConnectionStatus: try: self.try_connect(logger, config) except self.api_error as err: @@ -159,23 +150,23 @@ def check_config(self, logger: AirbyteLogger, config_path: str, config: json) -> return AirbyteConnectionStatus(status=Status.FAILED, message=error_msg) return AirbyteConnectionStatus(status=Status.SUCCEEDED) - def discover_cmd(self, logger: AirbyteLogger, config_path: str) -> str: + def discover_cmd(self, logger: logging.Logger, config_path: str) -> str: return f"{self.tap_cmd} --config {config_path} --discover" - def read_cmd(self, logger: AirbyteLogger, config_path: str, catalog_path: str, state_path: str = None) -> str: + def read_cmd(self, logger: logging.Logger, config_path: str, catalog_path: str, state_path: str = None) -> str: state_path = None if self.force_full_refresh else state_path args = {"--config": config_path, "--catalog": catalog_path, "--state": state_path} cmd = " ".join([f"{k} {v}" for k, v in args.items() if v is not None]) return f"{self.tap_cmd} {cmd}" - def discover(self, logger: AirbyteLogger, config_container: ConfigContainer) -> AirbyteCatalog: - catalog = super().discover(logger, config_container) + def discover(self, logger: logging.Logger, config: ConfigContainer) -> AirbyteCatalog: + catalog = super().discover(logger, config) if self.force_full_refresh: return CatalogHelper.coerce_catalog_as_full_refresh(catalog) return catalog - def try_connect(self, logger: AirbyteLogger, config: json): + def try_connect(self, logger: logging.Logger, config: Mapping[str, Any]): """Test provided credentials, raises self.api_error if something goes wrong""" raise NotImplementedError diff --git a/airbyte-cdk/python/airbyte_cdk/sources/source.py b/airbyte-cdk/python/airbyte_cdk/sources/source.py index 5e0396b3fcbd9..de0c1be2cceb7 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/source.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/source.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# @@ -7,37 +7,48 @@ import logging from abc import ABC, abstractmethod from collections import defaultdict -from typing import Any, Dict, Iterable, Mapping, MutableMapping +from typing import Any, Dict, Generic, Iterable, Mapping, MutableMapping, TypeVar -from airbyte_cdk.connector import Connector +from airbyte_cdk.connector import BaseConnector, DefaultConnectorMixin, TConfig from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, ConfiguredAirbyteCatalog +TState = TypeVar("TState") +TCatalog = TypeVar("TCatalog") -class Source(Connector, ABC): - # can be overridden to change an input state - def read_state(self, state_path: str) -> Dict[str, Any]: - if state_path: - state_obj = json.loads(open(state_path, "r").read()) - else: - state_obj = {} - state = defaultdict(dict, state_obj) - return state - # can be overridden to change an input catalog - def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog: - return ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path)) +class BaseSource(BaseConnector[TConfig], ABC, Generic[TConfig, TState, TCatalog]): + @abstractmethod + def read_state(self, state_path: str) -> TState: + ... + + @abstractmethod + def read_catalog(self, catalog_path: str) -> TCatalog: + ... @abstractmethod - def read( - self, logger: logging.Logger, config: Mapping[str, Any], catalog: ConfiguredAirbyteCatalog, state: MutableMapping[str, Any] = None - ) -> Iterable[AirbyteMessage]: + def read(self, logger: logging.Logger, config: TConfig, catalog: TCatalog, state: TState = None) -> Iterable[AirbyteMessage]: """ Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state. """ @abstractmethod - def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog: + def discover(self, logger: logging.Logger, config: TConfig) -> AirbyteCatalog: """ Returns an AirbyteCatalog representing the available streams and fields in this integration. For example, given valid credentials to a Postgres database, returns an Airbyte catalog where each postgres table is a stream, and each table column is a field. 
""" + + +class Source(DefaultConnectorMixin, BaseSource[Mapping[str, Any], MutableMapping[str, Any], ConfiguredAirbyteCatalog], ABC): + # can be overridden to change an input state + def read_state(self, state_path: str) -> Dict[str, Any]: + if state_path: + state_obj = json.loads(open(state_path, "r").read()) + else: + state_obj = {} + state = defaultdict(dict, state_obj) + return state + + # can be overridden to change an input catalog + def read_catalog(self, catalog_path: str) -> ConfiguredAirbyteCatalog: + return ConfiguredAirbyteCatalog.parse_obj(self.read_config(catalog_path)) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py index f4c4bd22b35c5..0df89f871a520 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/__init__.py @@ -3,6 +3,6 @@ # # Initialize Streams Package -from .core import Stream +from .core import IncrementalMixin, Stream -__all__ = ["Stream"] +__all__ = ["IncrementalMixin", "Stream"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py index 9aea6d7d15084..02199df40c31f 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/core.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -12,6 +12,7 @@ from airbyte_cdk.models import AirbyteStream, SyncMode from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer +from deprecated.classic import deprecated def package_name_from_class(cls: object) -> str: @@ -20,6 +21,40 @@ def package_name_from_class(cls: object) -> str: return module.__name__.split(".")[0] +class IncrementalMixin(ABC): + """Mixin to make stream incremental. + + class IncrementalStream(Stream, IncrementalMixin): + @property + def state(self): + return self._state + + @state.setter + def state(self, value): + self._state[self.cursor_field] = value[self.cursor_field] + """ + + @property + @abstractmethod + def state(self) -> MutableMapping[str, Any]: + """State getter, should return state in form that can serialized to a string and send to the output + as a STATE AirbyteMessage. + + A good example of a state is a cursor_value: + { + self.cursor_field: "cursor_value" + } + + State should try to be as small as possible but at the same time descriptive enough to restore + syncing process from the point where it stopped. + """ + + @state.setter + @abstractmethod + def state(self, value: MutableMapping[str, Any]): + """State setter, accept state serialized by state getter.""" + + class Stream(ABC): """ Base abstract class for an Airbyte Stream. Makes no assumption of the Stream's underlying transport protocol. @@ -40,6 +75,18 @@ def name(self) -> str: """ return casing.camel_to_snake(self.__class__.__name__) + def get_error_display_message(self, exception: BaseException) -> Optional[str]: + """ + Retrieves the user-friendly display message that corresponds to an exception. + This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. + + The default implementation of this method does not return user-friendly messages for any exception type, but it should be overriden as needed. 
+ + :param exception: The exception that was raised + :return: A user-friendly message that indicates the cause of the error + """ + return None + @abstractmethod def read_records( self, @@ -65,6 +112,9 @@ def get_json_schema(self) -> Mapping[str, Any]: def as_airbyte_stream(self) -> AirbyteStream: stream = AirbyteStream(name=self.name, json_schema=dict(self.get_json_schema()), supported_sync_modes=[SyncMode.full_refresh]) + if self.namespace: + stream.namespace = self.namespace + if self.supports_incremental: stream.source_defined_cursor = self.source_defined_cursor stream.supported_sync_modes.append(SyncMode.incremental) # type: ignore @@ -94,6 +144,14 @@ def cursor_field(self) -> Union[str, List[str]]: """ return [] + @property + def namespace(self) -> Optional[str]: + """ + Override to return the namespace of this stream, e.g. the Postgres schema which this stream will emit records for. + :return: A string containing the name of the namespace. + """ + return None + @property def source_defined_cursor(self) -> bool: """ @@ -136,9 +194,9 @@ def state_checkpoint_interval(self) -> Optional[int]: """ return None + @deprecated(version="0.1.49", reason="You should use explicit state property instead, see IncrementalMixin docs.") def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]): - """ - Override to extract state from the latest record. Needed to implement incremental sync. + """Override to extract state from the latest record. Needed to implement incremental sync. Inspects the latest record extracted from the data source and the current state object and return an updated state object. @@ -169,7 +227,7 @@ def _wrapped_primary_key(keys: Optional[Union[str, List[str], List[List[str]]]]) elif isinstance(component, list): wrapped_keys.append(component) else: - raise ValueError("Element must be either list or str.") + raise ValueError(f"Element must be either list or str. Got: {type(component)}") return wrapped_keys else: - raise ValueError("Element must be either list or str.") + raise ValueError(f"Element must be either list or str. Got: {type(keys)}") diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/__init__.py index 32a5245229e91..494c395d3ad3a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/__init__.py @@ -1,13 +1,14 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # # Initialize Auth Package from .core import HttpAuthenticator, NoAuth from .oauth import Oauth2Authenticator -from .token import MultipleTokenAuthenticator, TokenAuthenticator +from .token import BasicHttpAuthenticator, MultipleTokenAuthenticator, TokenAuthenticator __all__ = [ + "BasicHttpAuthenticator", "HttpAuthenticator", "NoAuth", "Oauth2Authenticator", diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/core.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/core.py index cd47c3b406f7d..4dd892baa2a63 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/core.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/core.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/oauth.py index 1885aa9f30ab9..2ec43ed5a4255 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/oauth.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/oauth.py @@ -1,9 +1,9 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # -from typing import Any, List, Mapping, MutableMapping, Tuple +from typing import Any, List, Mapping, MutableMapping, Optional, Tuple import pendulum import requests @@ -26,7 +26,8 @@ def __init__( client_secret: str, refresh_token: str, scopes: List[str] = None, - refresh_access_token_headers: Mapping[str, Any] = None, + refresh_access_token_headers: Optional[Mapping[str, Any]] = None, + refresh_access_token_authenticator: Optional[HttpAuthenticator] = None, ): self.token_refresh_endpoint = token_refresh_endpoint self.client_secret = client_secret @@ -34,6 +35,7 @@ def __init__( self.refresh_token = refresh_token self.scopes = scopes self.refresh_access_token_headers = refresh_access_token_headers + self.refresh_access_token_authenticator = refresh_access_token_authenticator self._token_expiry_date = pendulum.now().subtract(days=1) self._access_token = None @@ -76,10 +78,19 @@ def refresh_access_token(self) -> Tuple[str, int]: method="POST", url=self.token_refresh_endpoint, data=self.get_refresh_request_body(), - headers=self.refresh_access_token_headers, + headers=self.get_refresh_access_token_headers(), ) response.raise_for_status() response_json = response.json() return response_json["access_token"], response_json["expires_in"] except Exception as e: raise Exception(f"Error while refreshing access token: {e}") from e + + def get_refresh_access_token_headers(self): + headers = {} + if self.refresh_access_token_headers: + headers = self.refresh_access_token_headers + if self.refresh_access_token_authenticator: + refresh_auth_headers = self.refresh_access_token_authenticator.get_auth_header() + headers.update(refresh_auth_headers) + return headers diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/token.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/token.py index 54cb3a318c625..938ec27f87f58 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/token.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/auth/token.py @@ -1,8 +1,9 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# +import base64 from itertools import cycle from typing import Any, List, Mapping @@ -32,3 +33,15 @@ def __init__(self, tokens: List[str], auth_method: str = "Bearer", auth_header: def get_auth_header(self) -> Mapping[str, Any]: return {self.auth_header: f"{self.auth_method} {next(self._tokens_iter)}"} + + +class BasicHttpAuthenticator(TokenAuthenticator): + """ + Builds auth based on the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 + https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme + """ + + def __init__(self, username: str, password: str, auth_method: str = "Basic", auth_header: str = "Authorization"): + auth_string = f"{username}:{password}".encode("utf8") + b64_encoded = base64.b64encode(auth_string).decode("utf8") + super().__init__(b64_encoded, auth_method, auth_header) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py index b309a6ce27dd8..a2a91da61493e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/exceptions.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -9,7 +9,9 @@ class BaseBackoffException(requests.exceptions.HTTPError): - pass + def __init__(self, request: requests.PreparedRequest, response: requests.Response): + error_message = f"Request URL: {request.url}, Response Code: {response.status_code}, Response Text: {response.text}" + super().__init__(error_message, request=request, response=response) class RequestBodyException(Exception): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py index f2b1f06597632..8bcba22f1573a 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
# @@ -14,7 +14,6 @@ import vcr.cassette as Cassette from airbyte_cdk.models import SyncMode from airbyte_cdk.sources.streams.core import Stream -from airbyte_cdk.sources.utils.sentry import AirbyteSentry from requests.auth import AuthBase from .auth.core import HttpAuthenticator, NoAuth @@ -22,7 +21,7 @@ from .rate_limiting import default_backoff_handler, user_defined_backoff_handler # list of all possible HTTP methods which can be used for sending of request bodies -BODY_REQUEST_METHODS = ("POST", "PUT", "PATCH") +BODY_REQUEST_METHODS = ("GET", "POST", "PUT", "PATCH") logging.getLogger("vcr").setLevel(logging.ERROR) @@ -248,7 +247,12 @@ def backoff_time(self, response: requests.Response) -> Optional[float]: return None def _create_prepared_request( - self, path: str, headers: Mapping = None, params: Mapping = None, json: Any = None, data: Any = None + self, + path: str, + headers: Mapping = None, + params: Mapping = None, + json: Any = None, + data: Any = None, ) -> requests.PreparedRequest: args = {"method": self.http_method, "url": urljoin(self.url_base, path), "headers": headers, "params": params} if self.http_method.upper() in BODY_REQUEST_METHODS: @@ -282,10 +286,11 @@ def _send(self, request: requests.PreparedRequest, request_kwargs: Mapping[str, Unexpected transient exceptions use the default backoff parameters. Unexpected persistent exceptions are not handled and will cause the sync to fail. """ - AirbyteSentry.add_breadcrumb(message=f"Issue {request.url}", data=request_kwargs) - with AirbyteSentry.start_transaction_span(op="_send", description=request.url): - response: requests.Response = self._session.send(request, **request_kwargs) - + self.logger.debug( + "Making outbound API request", extra={"headers": request.headers, "url": request.url, "request_body": request.body} + ) + response: requests.Response = self._session.send(request, **request_kwargs) + self.logger.debug("Receiving response", extra={"headers": response.headers, "status": response.status_code, "body": response.text}) if self.should_retry(response): custom_backoff_time = self.backoff_time(response) if custom_backoff_time: @@ -294,8 +299,11 @@ def _send(self, request: requests.PreparedRequest, request_kwargs: Mapping[str, raise DefaultBackoffException(request=request, response=response) elif self.raise_on_http_errors: # Raise any HTTP exceptions that happened in case there were unexpected ones - response.raise_for_status() - + try: + response.raise_for_status() + except requests.HTTPError as exc: + self.logger.error(response.text) + raise exc return response def _send_request(self, request: requests.PreparedRequest, request_kwargs: Mapping[str, Any]) -> requests.Response: @@ -325,12 +333,59 @@ def _send_request(self, request: requests.PreparedRequest, request_kwargs: Mappi """ if max_tries is not None: max_tries = max(0, max_tries) + 1 - AirbyteSentry.set_context("request", {"url": request.url, "headers": request.headers, "args": request_kwargs}) - with AirbyteSentry.start_transaction_span(op="_send_request"): - user_backoff_handler = user_defined_backoff_handler(max_tries=max_tries)(self._send) - backoff_handler = default_backoff_handler(max_tries=max_tries, factor=self.retry_factor) - return backoff_handler(user_backoff_handler)(request, request_kwargs) + user_backoff_handler = user_defined_backoff_handler(max_tries=max_tries)(self._send) + backoff_handler = default_backoff_handler(max_tries=max_tries, factor=self.retry_factor) + return backoff_handler(user_backoff_handler)(request, request_kwargs) + + 
@classmethod + def parse_response_error_message(cls, response: requests.Response) -> Optional[str]: + """ + Parses the raw response object from a failed request into a user-friendly error message. + By default, this method tries to grab the error message from JSON responses by following common API patterns. Override to parse differently. + + :param response: + :return: A user-friendly message that indicates the cause of the error + """ + + # default logic to grab error from common fields + def _try_get_error(value): + if isinstance(value, str): + return value + elif isinstance(value, list): + return ", ".join(_try_get_error(v) for v in value) + elif isinstance(value, dict): + new_value = ( + value.get("message") + or value.get("messages") + or value.get("error") + or value.get("errors") + or value.get("failures") + or value.get("failure") + ) + return _try_get_error(new_value) + return None + + try: + body = response.json() + return _try_get_error(body) + except requests.exceptions.JSONDecodeError: + return None + + def get_error_display_message(self, exception: BaseException) -> Optional[str]: + """ + Retrieves the user-friendly display message that corresponds to an exception. + This will be called when encountering an exception while reading records from the stream, and used to build the AirbyteTraceMessage. + + The default implementation of this method only handles HTTPErrors by passing the response to self.parse_response_error_message(). + The method should be overriden as needed to handle any additional exception types. + + :param exception: The exception that was raised + :return: A user-friendly message that indicates the cause of the error + """ + if isinstance(exception, requests.HTTPError): + return self.parse_response_error_message(exception.response) + return None def read_records( self, @@ -343,38 +398,35 @@ def read_records( pagination_complete = False next_page_token = None - with AirbyteSentry.start_transaction("read_records", self.name), AirbyteSentry.start_transaction_span("read_records"): - while not pagination_complete: - request_headers = self.request_headers( - stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token - ) - request = self._create_prepared_request( - path=self.path(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), - headers=dict(request_headers, **self.authenticator.get_auth_header()), - params=self.request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), - json=self.request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), - data=self.request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), - ) - request_kwargs = self.request_kwargs(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) - - if self.use_cache: - # use context manager to handle and store cassette metadata - with self.cache_file as cass: - self.cassete = cass - # vcr tries to find records based on the request, if such records exist, return from cache file - # else make a request and save record in cache file - response = self._send_request(request, request_kwargs) - - else: + while not pagination_complete: + request_headers = self.request_headers(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + request = self._create_prepared_request( + path=self.path(stream_state=stream_state, stream_slice=stream_slice, 
next_page_token=next_page_token), + headers=dict(request_headers, **self.authenticator.get_auth_header()), + params=self.request_params(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + json=self.request_body_json(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + data=self.request_body_data(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token), + ) + request_kwargs = self.request_kwargs(stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token) + + if self.use_cache: + # use context manager to handle and store cassette metadata + with self.cache_file as cass: + self.cassete = cass + # vcr tries to find records based on the request, if such records exist, return from cache file + # else make a request and save record in cache file response = self._send_request(request, request_kwargs) - yield from self.parse_response(response, stream_state=stream_state, stream_slice=stream_slice) - next_page_token = self.next_page_token(response) - if not next_page_token: - pagination_complete = True + else: + response = self._send_request(request, request_kwargs) + yield from self.parse_response(response, stream_state=stream_state, stream_slice=stream_slice) + + next_page_token = self.next_page_token(response) + if not next_page_token: + pagination_complete = True - # Always return an empty generator just in case no records were ever yielded - yield from [] + # Always return an empty generator just in case no records were ever yielded + yield from [] class HttpSubStream(HttpStream, ABC): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py index 2401e51005d52..baf1412116890 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/rate_limiting.py @@ -1,22 +1,26 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # - +import logging import sys import time from typing import Optional import backoff -from airbyte_cdk.logger import AirbyteLogger from requests import codes, exceptions from .exceptions import DefaultBackoffException, UserDefinedBackoffException -TRANSIENT_EXCEPTIONS = (DefaultBackoffException, exceptions.ConnectTimeout, exceptions.ReadTimeout, exceptions.ConnectionError) +TRANSIENT_EXCEPTIONS = ( + DefaultBackoffException, + exceptions.ConnectTimeout, + exceptions.ReadTimeout, + exceptions.ConnectionError, + exceptions.ChunkedEncodingError, +) -# TODO inject singleton logger? -logger = AirbyteLogger() +logger = logging.getLogger("airbyte") def default_backoff_handler(max_tries: Optional[int], factor: float, **kwargs): diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py index 08bdcd068be54..c4f64a971ea0e 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py @@ -1,12 +1,8 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# from .oauth import Oauth2Authenticator -from .token import MultipleTokenAuthenticator, TokenAuthenticator +from .token import BasicHttpAuthenticator, MultipleTokenAuthenticator, TokenAuthenticator -__all__ = [ - "Oauth2Authenticator", - "TokenAuthenticator", - "MultipleTokenAuthenticator", -] +__all__ = ["Oauth2Authenticator", "TokenAuthenticator", "MultipleTokenAuthenticator", "BasicHttpAuthenticator"] diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py new file mode 100644 index 0000000000000..2a8bd72833715 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py @@ -0,0 +1,129 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from typing import Any, List, Mapping, MutableMapping, Tuple + +import pendulum +import requests +from requests.auth import AuthBase + + +class AbstractOauth2Authenticator(AuthBase): + """ + Abstract class for OAuth authenticators that implement the OAuth token refresh flow. The authenticator + is designed to generically perform the refresh flow without regard to how config fields are read and set, by + delegating that behavior to the classes implementing the interface. + """ + + def __call__(self, request: requests.Request) -> requests.Request: + """Attach the HTTP headers required to authenticate on the HTTP request""" + request.headers.update(self.get_auth_header()) + return request + + def get_auth_header(self) -> Mapping[str, Any]: + """HTTP header to set on the requests""" + return {"Authorization": f"Bearer {self.get_access_token()}"} + + def get_access_token(self) -> str: + """Returns the access token""" + if self.token_has_expired(): + t0 = pendulum.now() + token, expires_in = self.refresh_access_token() + self.access_token = token + self.set_token_expiry_date(t0.add(seconds=expires_in)) + + return self.access_token + + def token_has_expired(self) -> bool: + """Returns True if the token is expired""" + return pendulum.now() > self.get_token_expiry_date() + + def build_refresh_request_body(self) -> Mapping[str, Any]: + """ + Returns the request body to set on the refresh request + + Override to define additional parameters + """ + payload: MutableMapping[str, Any] = { + "grant_type": "refresh_token", + "client_id": self.get_client_id(), + "client_secret": self.get_client_secret(), + "refresh_token": self.get_refresh_token(), + } + + if self.get_scopes(): + payload["scopes"] = self.get_scopes() + + if self.get_refresh_request_body(): + for key, val in self.get_refresh_request_body().items(): + # We defer to existing oauth constructs over custom configured fields + if key not in payload: + payload[key] = val + + return payload + + def refresh_access_token(self) -> Tuple[str, int]: + """ + Returns the new access token and its lifespan in seconds + + :return: a tuple of (access_token, token_lifespan_in_seconds) + """ + try: + response = requests.request(method="POST", url=self.get_token_refresh_endpoint(), data=self.build_refresh_request_body()) + response.raise_for_status() + response_json = response.json() + return response_json[self.get_access_token_name()], response_json[self.get_expires_in_name()] + except Exception as e: + raise Exception(f"Error while refreshing access token: {e}") from e + + @abstractmethod + def get_token_refresh_endpoint(self) -> str: + """Returns the endpoint to refresh the access token""" +
+ @abstractmethod + def get_client_id(self) -> str: + """The client id to authenticate""" + + @abstractmethod + def get_client_secret(self) -> str: + """The client secret to authenticate""" + + @abstractmethod + def get_refresh_token(self) -> str: + """The token used to refresh the access token when it expires""" + + @abstractmethod + def get_scopes(self) -> List[str]: + """List of requested scopes""" + + @abstractmethod + def get_token_expiry_date(self) -> pendulum.DateTime: + """Expiration date of the access token""" + + @abstractmethod + def set_token_expiry_date(self, value: pendulum.DateTime): + """Setter for access token expiration date""" + + @abstractmethod + def get_access_token_name(self) -> str: + """Field to extract access token from in the response""" + + @abstractmethod + def get_expires_in_name(self) -> str: + """Returns the expires_in field name""" + + @abstractmethod + def get_refresh_request_body(self) -> Mapping[str, Any]: + """Returns the request body to set on the refresh request""" + + @property + @abstractmethod + def access_token(self) -> str: + """Returns the access token""" + + @access_token.setter + @abstractmethod + def access_token(self, value: str) -> str: + """Setter for the access token""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py new file mode 100644 index 0000000000000..d416499bcafe9 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from abc import abstractmethod +from typing import Any, Mapping + +from requests.auth import AuthBase + + +class AbstractHeaderAuthenticator(AuthBase): + """Abstract class for header-based authenticators that add a header to outgoing HTTP requests.""" + + def __call__(self, request): + """Attach the HTTP headers required to authenticate on the HTTP request""" + request.headers.update(self.get_auth_header()) + return request + + def get_auth_header(self) -> Mapping[str, Any]: + """The header to set on outgoing HTTP requests""" + + return {self.auth_header: self.token} + + @property + @abstractmethod + def auth_header(self) -> str: + """HTTP header to set on the requests""" + + @property + @abstractmethod + def token(self) -> str: + """The header value to set on outgoing HTTP requests""" diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py index a77fa5c730494..d479652f78b8f 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py @@ -1,16 +1,14 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # - -from typing import Any, List, Mapping, MutableMapping, Tuple +from typing import Any, List, Mapping import pendulum -import requests -from requests.auth import AuthBase +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth import AbstractOauth2Authenticator -class Oauth2Authenticator(AuthBase): +class Oauth2Authenticator(AbstractOauth2Authenticator): """ Generates OAuth2.0 access tokens from an OAuth2.0 refresh token and client credentials.
The generated access token is attached to each request via the Authorization header. @@ -26,59 +24,54 @@ def __init__( token_expiry_date: pendulum.DateTime = None, access_token_name: str = "access_token", expires_in_name: str = "expires_in", + refresh_request_body: Mapping[str, Any] = None, ): - self.token_refresh_endpoint = token_refresh_endpoint - self.client_secret = client_secret - self.client_id = client_id - self.refresh_token = refresh_token - self.scopes = scopes - self.access_token_name = access_token_name - self.expires_in_name = expires_in_name + self._token_refresh_endpoint = token_refresh_endpoint + self._client_secret = client_secret + self._client_id = client_id + self._refresh_token = refresh_token + self._scopes = scopes + self._access_token_name = access_token_name + self._expires_in_name = expires_in_name + self._refresh_request_body = refresh_request_body self._token_expiry_date = token_expiry_date or pendulum.now().subtract(days=1) self._access_token = None - def __call__(self, request): - request.headers.update(self.get_auth_header()) - return request + def get_token_refresh_endpoint(self) -> str: + return self._token_refresh_endpoint - def get_auth_header(self) -> Mapping[str, Any]: - return {"Authorization": f"Bearer {self.get_access_token()}"} + def get_client_id(self) -> str: + return self._client_id - def get_access_token(self): - if self.token_has_expired(): - t0 = pendulum.now() - token, expires_in = self.refresh_access_token() - self._access_token = token - self._token_expiry_date = t0.add(seconds=expires_in) + def get_client_secret(self) -> str: + return self._client_secret - return self._access_token + def get_refresh_token(self) -> str: + return self._refresh_token + + def get_access_token_name(self) -> str: + return self._access_token_name - def token_has_expired(self) -> bool: - return pendulum.now() > self._token_expiry_date + def get_scopes(self) -> [str]: + return self._scopes + + def get_expires_in_name(self) -> str: + return self._expires_in_name def get_refresh_request_body(self) -> Mapping[str, Any]: - """Override to define additional parameters""" - payload: MutableMapping[str, Any] = { - "grant_type": "refresh_token", - "client_id": self.client_id, - "client_secret": self.client_secret, - "refresh_token": self.refresh_token, - } - - if self.scopes: - payload["scopes"] = self.scopes - - return payload - - def refresh_access_token(self) -> Tuple[str, int]: - """ - returns a tuple of (access_token, token_lifespan_in_seconds) - """ - try: - response = requests.request(method="POST", url=self.token_refresh_endpoint, data=self.get_refresh_request_body()) - response.raise_for_status() - response_json = response.json() - return response_json[self.access_token_name], response_json[self.expires_in_name] - except Exception as e: - raise Exception(f"Error while refreshing access token: {e}") from e + return self._refresh_request_body + + def get_token_expiry_date(self) -> pendulum.DateTime: + return self._token_expiry_date + + def set_token_expiry_date(self, value: pendulum.DateTime): + self._token_expiry_date = value + + @property + def access_token(self) -> str: + return self._access_token + + @access_token.setter + def access_token(self, value: str): + self._access_token = value diff --git a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py index 9bc8c664bfe87..5b15cd923f12a 100644 --- 
a/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/streams/http/requests_native_auth/token.py @@ -1,39 +1,73 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # +import base64 from itertools import cycle -from typing import Any, List, Mapping +from typing import List -from requests.auth import AuthBase +from airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token import AbstractHeaderAuthenticator -class MultipleTokenAuthenticator(AuthBase): +class MultipleTokenAuthenticator(AbstractHeaderAuthenticator): """ Builds auth header, based on the list of tokens provided. Auth header is changed per each `get_auth_header` call, using each token in cycle. The token is attached to each request via the `auth_header` header. """ + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {next(self._tokens_iter)}" + def __init__(self, tokens: List[str], auth_method: str = "Bearer", auth_header: str = "Authorization"): - self.auth_method = auth_method - self.auth_header = auth_header + self._auth_method = auth_method + self._auth_header = auth_header self._tokens = tokens self._tokens_iter = cycle(self._tokens) - def __call__(self, request): - request.headers.update(self.get_auth_header()) - return request - - def get_auth_header(self) -> Mapping[str, Any]: - return {self.auth_header: f"{self.auth_method} {next(self._tokens_iter)}"} - -class TokenAuthenticator(MultipleTokenAuthenticator): +class TokenAuthenticator(AbstractHeaderAuthenticator): """ Builds auth header, based on the token provided. The token is attached to each request via the `auth_header` header. """ + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {self._token}" + def __init__(self, token: str, auth_method: str = "Bearer", auth_header: str = "Authorization"): - super().__init__([token], auth_method, auth_header) + self._auth_header = auth_header + self._auth_method = auth_method + self._token = token + + +class BasicHttpAuthenticator(AbstractHeaderAuthenticator): + """ + Builds auth based on the basic authentication scheme as defined by RFC 7617, which transmits credentials as USER ID/password pairs, encoded using base64 + https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication#basic_authentication_scheme + """ + + @property + def auth_header(self) -> str: + return self._auth_header + + @property + def token(self) -> str: + return f"{self._auth_method} {self._token}" + + def __init__(self, username: str, password: str = "", auth_method: str = "Basic", auth_header: str = "Authorization"): + auth_string = f"{username}:{password}".encode("utf8") + b64_encoded = base64.b64encode(auth_string).decode("utf8") + self._auth_header = auth_header + self._auth_method = auth_method + self._token = b64_encoded diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py index 3c2a978421e03..59b14416632ff 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/casing.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
# diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/catalog_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/catalog_helpers.py index fddf79fa242e5..b1d83cfa2c127 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/catalog_helpers.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/catalog_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py index 1db9ab5843b97..be8e257d600a2 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -7,9 +7,8 @@ import json import os import pkgutil -from typing import Any, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Set, Tuple, Union +from typing import Any, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Tuple, Union -import dpath.util import jsonref from airbyte_cdk.models import ConnectorSpecification from jsonschema import RefResolver, validate @@ -192,32 +191,3 @@ def split_config(config: Mapping[str, Any]) -> Tuple[dict, InternalConfig]: else: main_config[k] = v return main_config, InternalConfig.parse_obj(internal_config) - - -def get_secret_values(schema: Mapping[str, Any], config: Mapping[str, Any]) -> List[str]: - def get_secret_pathes(schema: Mapping[str, Any]) -> Set[str]: - pathes = set() - - def traverse_schema(schema: Any, path: List[str]): - if isinstance(schema, dict): - for k, v in schema.items(): - traverse_schema(v, [*path, k]) - elif isinstance(schema, list): - for i in schema: - traverse_schema(i, path) - else: - if path[-1] == "airbyte_secret" and schema is True: - path = "/".join([p for p in path[:-1] if p not in ["properties", "oneOf"]]) - pathes.add(path) - - traverse_schema(schema, []) - return pathes - - secret_pathes = get_secret_pathes(schema) - result = [] - for path in secret_pathes: - try: - result.append(dpath.util.get(config, path)) - except KeyError: - pass - return result diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_models.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_models.py index 7a36964937036..26d5302170a60 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_models.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/schema_models.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # from typing import Any, Dict, Optional, Type diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/sentry.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/sentry.py deleted file mode 100644 index 14dfd69aafd6b..0000000000000 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/sentry.py +++ /dev/null @@ -1,228 +0,0 @@ -# -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. -# - -import contextlib -import os -import re -from typing import Any, Callable, List, Optional, Type, Union -from uuid import uuid4 - -import sentry_sdk -from sentry_sdk.integrations.atexit import AtexitIntegration -from sentry_sdk.integrations.excepthook import ExcepthookIntegration -from sentry_sdk.integrations.logging import LoggingIntegration - - -class AirbyteSentry: - """ - Class for working with sentry sdk. 
It provides methods to: - - init sentry sdk based on env variable - - add breadcrumbs and set context - - work with transactions and transaction spans - - set tag and capture message and capture exception - Also it implements client side sensitive data scrubbing. - """ - - DSN_ENV_NAME = "SENTRY_DSN" - SECRET_MASK = "***" - # Maximum number of breadcrumbs to send on fail. Breadcrumbs is trail of - # events that occured before the fail and being sent to server only - # if handled or unhandled exception occured. - MAX_BREADCRUMBS = 30 - # Event sending rate. could be from 0 (0%) to 1.0 (100 % events being sent - # to sentry server) - TRACES_SAMPLE_RATE = 1.0 - SECRET_REGEXP = [ - re.compile("(api_key=)[a-zA-Z0-9_]+"), - re.compile("(access_token=)[a-zA-Z0-9_]+"), - re.compile("(refresh_token=)[a-zA-Z0-9_]+"), - re.compile("(token )[a-zA-Z0-9_]+"), - re.compile("(Bearer )[a-zA-Z0-9_]+"), - ] - SENSITIVE_KEYS = ["Authorization", "client_secret", "access_token"] - - sentry_enabled = False - source_tag = "" - run_id = str(uuid4()) - secret_values: List[str] = [] - - @classmethod - def process_value(cls, key: str, value: str): - """ - Process single value. Used by recursive replace_value method or - standalone for single value. - """ - for secret in cls.secret_values: - value = value.replace(secret, cls.SECRET_MASK) - if key in cls.SENSITIVE_KEYS: - return cls.SECRET_MASK - for regexp in cls.SECRET_REGEXP: - value = regexp.sub(f"\\1{cls.SECRET_MASK}", value) - return value - - @classmethod - def replace_value(cls, key, value): - """ - Recursively scan event and replace all sensitive data with SECRET_MASK. - Perform inplace data replace i.e. its not creating new object. - """ - if isinstance(value, dict): - for k, v in value.items(): - value[k] = cls.replace_value(k, v) - elif isinstance(value, list): - for index, v in enumerate(value): - value[index] = cls.replace_value(index, v) - elif isinstance(value, str): - return cls.process_value(key, value) - return value - - @classmethod - def filter_event(cls, event, hint): - """ - Callback for before_send sentry hook. - """ - if "message" in event: - event["message"] = cls.process_value(None, event["message"]) - cls.replace_value(None, event.get("exception")) - cls.replace_value(None, event.get("contexts")) - return event - - @classmethod - def filter_breadcrumb(cls, event, hint): - """ - Callback for before_breadcrumb sentry hook. - """ - cls.replace_value(None, event) - return event - - @classmethod - def init( - cls, - source_tag: str = None, - transport: Optional[Union[Type[sentry_sdk.transport.Transport], Callable[[Any], None]]] = None, - secret_values: List[str] = [], - ): - """ - Read sentry data source name (DSN) from env variable and initialize sentry cdk. - Args: - source_tag: str - Source name to be used in "source" tag for events organazing. - transport: Transport or Callable - transport object for transfering - sentry event to remote server. Usually used for testing, by default - HTTP transport used - secret_values: List[str] - list of string that have to be filtered - out before sending event to sentry server. 
- - """ - sentry_dsn = os.environ.get(cls.DSN_ENV_NAME) - if sentry_dsn: - cls.sentry_enabled = True - cls.secret_values = secret_values - sentry_sdk.init( - sentry_dsn, - max_breadcrumbs=cls.MAX_BREADCRUMBS, - traces_sample_rate=cls.TRACES_SAMPLE_RATE, - before_send=AirbyteSentry.filter_event, - before_breadcrumb=AirbyteSentry.filter_breadcrumb, - transport=transport, - # Use only limited list of integration cause sentry may send - # transaction events e.g. it could send httplib request with - # url and authorization info over StdlibIntegration and it - # would bypass before_send hook. - integrations=[ - ExcepthookIntegration(always_run=True), - AtexitIntegration(), - LoggingIntegration(), - ], - # Disable default integrations cause sentry does not allow to - # filter transactions event that could transfer sensitive data - default_integrations=False, - ) - if source_tag: - sentry_sdk.set_tag("source", source_tag) - sentry_sdk.set_tag("run_id", cls.run_id) - cls.source_tag = source_tag - - def if_enabled(f): - def wrapper(cls, *args, **kvargs): - if cls.sentry_enabled: - return f(cls, *args, **kvargs) - - return wrapper - - def if_enabled_else(return_value): - def if_enabled(f): - def wrapper(cls, *args, **kvargs): - if cls.sentry_enabled: - return f(cls, *args, **kvargs) - else: - return return_value - - return wrapper - - return if_enabled - - @classmethod - @if_enabled - def set_tag(cls, tag_name: str, value: Any): - """ - Set tag that is handy for events organazing and filtering by sentry UI. - """ - sentry_sdk.set_tag(tag_name, value) - - @classmethod - @if_enabled - def add_breadcrumb(cls, message, data=None): - """ - Add sentry breadcrumb. - """ - sentry_sdk.add_breadcrumb(message=message, data=data) - - @classmethod - @if_enabled - def set_context(cls, name, data): - # Global context being used by transaction event as well. Since we cant - # filter senstitve data coming from transaction event using sentry - # before_event hook, apply filter to context here. - cls.replace_value(None, data) - sentry_sdk.set_context(name, data) - - @classmethod - @if_enabled - def capture_message(cls, message): - """ - Send message event to sentry. - """ - sentry_sdk.capture_message(message) - - @classmethod - @if_enabled - def capture_exception( - cls, - error: Optional[BaseException] = None, - scope: Optional[Any] = None, - **scope_args, - ): - """ - Report handled execption to sentry. - """ - sentry_sdk.capture_exception(error, scope=scope, **scope_args) - - @classmethod - @if_enabled_else(contextlib.nullcontext()) - def start_transaction(cls, op, name=None): - """ - Return context manager for starting sentry transaction for performance monitoring. - """ - return sentry_sdk.start_transaction(op=op, name=f"{cls.source_tag}.{name}") - - @classmethod - @if_enabled_else(contextlib.nullcontext()) - def start_transaction_span(cls, op, description=None): - """ - Return context manager for starting sentry transaction span inside existing sentry transaction. - """ - # Apply filter to description since we cannot use before_send sentry - # hook for transaction event. 
- description = cls.replace_value(None, description) - return sentry_sdk.start_span(op=op, description=description) diff --git a/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py b/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py index ed974ef1305b8..1759c316dc172 100644 --- a/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py +++ b/airbyte-cdk/python/airbyte_cdk/sources/utils/transform.py @@ -1,15 +1,15 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # +import logging from distutils.util import strtobool from enum import Flag, auto from typing import Any, Callable, Dict, Mapping, Optional -from airbyte_cdk.logger import AirbyteLogger from jsonschema import Draft7Validator, validators -logger = AirbyteLogger() +logger = logging.getLogger("airbyte") class TransformConfig(Flag): @@ -114,7 +114,7 @@ def default_convert(original_item: Any, subschema: Dict[str, Any]) -> Any: if isinstance(original_item, str): return strtobool(original_item) == 1 return bool(original_item) - except ValueError: + except (ValueError, TypeError): return original_item return original_item @@ -174,4 +174,4 @@ def transform(self, record: Dict[str, Any], schema: Mapping[str, Any]): just calling normalizer.validate() would throw an exception on first validation occurences and stop processing rest of schema. """ - logger.warn(e.message) + logger.warning(e.message) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/__init__.py b/airbyte-cdk/python/airbyte_cdk/utils/__init__.py index e69de29bb2d1d..29556737b88b2 100644 --- a/airbyte-cdk/python/airbyte_cdk/utils/__init__.py +++ b/airbyte-cdk/python/airbyte_cdk/utils/__init__.py @@ -0,0 +1,6 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# +from .traced_exception import AirbyteTracedException + +__all__ = ["AirbyteTracedException"] diff --git a/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py b/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py index 0a64efed7df4f..41e615d628db8 100644 --- a/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py +++ b/airbyte-cdk/python/airbyte_cdk/utils/airbyte_secrets_utils.py @@ -1,20 +1,71 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# -import logging from typing import Any, List, Mapping -from airbyte_cdk.sources import Source -from airbyte_cdk.utils.mapping_utils import all_key_pairs_dot_notation, get_value_by_dot_notation +import dpath.util -def get_secrets(source: Source, config: Mapping[str, Any], logger: logging.Logger) -> List[Any]: +def get_secret_paths(spec: Mapping[str, Any]) -> List[List[str]]: + paths = [] + + def traverse_schema(schema_item: Any, path: List[str]): + """ + schema_item can be any property or value in the originally input jsonschema, depending on how far down the recursion stack we go + path is the path to that schema item in the original input + for example if we have the input {'password': {'type': 'string', 'airbyte_secret': True}} then the arguments will evolve + as follows: + schema_item={'password': {'type': 'string', 'airbyte_secret': True}}, path=[] + schema_item={'type': 'string', 'airbyte_secret': True}, path=['password'] + schema_item='string', path=['password', 'type'] + schema_item=True, path=['password', 'airbyte_secret'] + """ + if isinstance(schema_item, dict): + for k, v in schema_item.items(): + traverse_schema(v, [*path, k]) + elif isinstance(schema_item, list): + for i in schema_item: + traverse_schema(i, path) + else: + if path[-1] == "airbyte_secret" and schema_item is True: + filtered_path = [p for p in path[:-1] if p not in ["properties", "oneOf"]] + paths.append(filtered_path) + + traverse_schema(spec, []) + return paths + + +def get_secrets(connection_specification: Mapping[str, Any], config: Mapping[str, Any]) -> List[Any]: """ - Get a list of secrets from the source config based on the source specification + Get a list of secret values from the source config based on the source specification + :type connection_specification: the connection_specification field of an AirbyteSpecification i.e. the JSONSchema definition """ - flattened_key_values = all_key_pairs_dot_notation(source.spec(logger).connectionSpecification.get("properties", {})) - secret_key_names = [ - ".".join(key.split(".")[:1]) for key, value in flattened_key_values.items() if value and key.endswith("airbyte_secret") - ] - return [str(get_value_by_dot_notation(config, key)) for key in secret_key_names if config.get(key)] + secret_paths = get_secret_paths(connection_specification.get("properties", {})) + result = [] + for path in secret_paths: + try: + result.append(dpath.util.get(config, path)) + except KeyError: + # Since we try to get paths to all known secrets in the spec, in the case of oneOfs, some secret fields may not be present + # In that case, a KeyError is thrown. This is expected behavior. + pass + return result + + +__SECRETS_FROM_CONFIG: List[str] = [] + + +def update_secrets(secrets: List[str]): + """Update the list of secrets to be replaced""" + global __SECRETS_FROM_CONFIG + __SECRETS_FROM_CONFIG = secrets + + +def filter_secrets(string: str) -> str: + """Filter secrets from a string by replacing them with ****""" + # TODO this should perform a maximal match for each secret. if "x" and "xk" are both secret values, and this method is called twice on + # the input "xk", then depending on call order it might only obfuscate "*k". This is a bug.
+ for secret in __SECRETS_FROM_CONFIG: + string = string.replace(str(secret), "****") + return string diff --git a/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py b/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py index 25983c42c71a4..63417df4f39b9 100644 --- a/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py +++ b/airbyte-cdk/python/airbyte_cdk/utils/event_timing.py @@ -1,16 +1,15 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # import datetime +import logging import time from contextlib import contextmanager from dataclasses import dataclass, field from typing import Optional -from airbyte_cdk.logger import AirbyteLogger - -logger = AirbyteLogger() +logger = logging.getLogger("airbyte") class EventTimer: @@ -42,7 +41,7 @@ def finish_event(self): event = self.stack.pop(0) event.finish() else: - logger.warn(f"{self.name} finish_event called without start_event") + logger.warning(f"{self.name} finish_event called without start_event") def report(self, order_by="name"): """ diff --git a/airbyte-cdk/python/airbyte_cdk/utils/mapping_utils.py b/airbyte-cdk/python/airbyte_cdk/utils/mapping_utils.py deleted file mode 100644 index c618316afea49..0000000000000 --- a/airbyte-cdk/python/airbyte_cdk/utils/mapping_utils.py +++ /dev/null @@ -1,41 +0,0 @@ -# -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. -# - -from functools import reduce -from typing import Any, Iterable, List, Mapping, Optional, Tuple - - -def all_key_pairs_dot_notation(dict_obj: Mapping) -> Mapping[str, Any]: - """ - Recursively iterate through a dictionary and return a dictionary of all key-value pairs in dot notation. - keys are prefixed with the list of keys passed in as prefix. - """ - - def _all_key_pairs_dot_notation(_dict_obj: Mapping, prefix: List[str] = []) -> Iterable[Tuple[str, Any]]: - for key, value in _dict_obj.items(): - if isinstance(value, dict): - prefix.append(str(key)) - yield from _all_key_pairs_dot_notation(value, prefix) - prefix.pop() - else: - prefix.append(str(key)) - yield ".".join(prefix), value - prefix.pop() - - return {k: v for k, v in _all_key_pairs_dot_notation(dict_obj)} - - -def get_value_by_dot_notation(dict_obj: Mapping, key: str, default: Optional[Any] = ...) -> Any: - """ - Return the value of a key in dot notation in a arbitrarily nested Mapping. - dict_obj: Mapping - key: str - default: Any - raises: KeyError if default is not provided and the key is not found - ex.: - dict_obj = {"nested": {"key": "value"}} - get_value_by_dot_notation(dict_obj, "nested.key") == "value" -> True - """ - - return reduce(lambda d, key_name: d[key_name] if default is ... else d.get(key_name, default), key.split("."), dict_obj) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py b/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py new file mode 100644 index 0000000000000..af8bad293ff02 --- /dev/null +++ b/airbyte-cdk/python/airbyte_cdk/utils/traced_exception.py @@ -0,0 +1,74 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +import traceback +from datetime import datetime + +from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteTraceMessage, FailureType, TraceType +from airbyte_cdk.models import Type as MessageType +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets + + +class AirbyteTracedException(Exception): + """ + An exception that should be emitted as an AirbyteTraceMessage + """ + + def __init__( + self, + internal_message: str = None, + message: str = None, + failure_type: FailureType = FailureType.system_error, + exception: BaseException = None, + ): + """ + :param internal_message: the internal error that caused the failure + :param message: a user-friendly message that indicates the cause of the error + :param failure_type: the type of error + :param exception: the exception that caused the error, from which the stack trace should be retrieved + """ + self.internal_message = internal_message + self.message = message + self.failure_type = failure_type + self._exception = exception + super().__init__(internal_message) + + def as_airbyte_message(self) -> AirbyteMessage: + """ + Builds an AirbyteTraceMessage from the exception + """ + now_millis = datetime.now().timestamp() * 1000.0 + + trace_exc = self._exception or self + stack_trace_str = "".join(traceback.TracebackException.from_exception(trace_exc).format()) + + trace_message = AirbyteTraceMessage( + type=TraceType.ERROR, + emitted_at=now_millis, + error=AirbyteErrorTraceMessage( + message=self.message or "Something went wrong in the connector. See the logs for more details.", + internal_message=self.internal_message, + failure_type=self.failure_type, + stack_trace=stack_trace_str, + ), + ) + + return AirbyteMessage(type=MessageType.TRACE, trace=trace_message) + + def emit_message(self): + """ + Prints the exception as an AirbyteTraceMessage. + Note that this will be called automatically on uncaught exceptions when using the airbyte_cdk entrypoint. 
+ """ + message = self.as_airbyte_message().json(exclude_unset=True) + filtered_message = filter_secrets(message) + print(filtered_message) + + @classmethod + def from_exception(cls, exc: Exception, *args, **kwargs) -> "AirbyteTracedException": + """ + Helper to create an AirbyteTracedException from an existing exception + :param exc: the exception that caused the error + """ + return cls(internal_message=str(exc), exception=exc, *args, **kwargs) diff --git a/airbyte-cdk/python/bin/generate-protocol-files.sh b/airbyte-cdk/python/bin/generate-protocol-files.sh index b268f4d531852..a758093c60b96 100755 --- a/airbyte-cdk/python/bin/generate-protocol-files.sh +++ b/airbyte-cdk/python/bin/generate-protocol-files.sh @@ -4,7 +4,7 @@ set -e [ -z "$ROOT_DIR" ] && exit 1 -YAML_DIR=airbyte-protocol/models/src/main/resources/airbyte_protocol +YAML_DIR=airbyte-protocol/protocol-models/src/main/resources/airbyte_protocol OUTPUT_DIR=airbyte-cdk/python/airbyte_cdk/models function main() { @@ -18,6 +18,7 @@ function main() { docker run --user "$(id -u):$(id -g)" -v "$ROOT_DIR":/airbyte airbyte/code-generator:dev \ --input "/airbyte/$YAML_DIR/$filename_wo_ext.yaml" \ --output "/airbyte/$OUTPUT_DIR/$filename_wo_ext.py" \ + --use-title-as-name \ --disable-timestamp done } diff --git a/airbyte-cdk/python/docs/concepts/README.md b/airbyte-cdk/python/docs/concepts/README.md index cf5f9365232fa..b7daf16cad4df 100644 --- a/airbyte-cdk/python/docs/concepts/README.md +++ b/airbyte-cdk/python/docs/concepts/README.md @@ -1,6 +1,6 @@ # Connector Development Kit Concepts -This concepts section serves as a general introduction to the Python CDK. Readers will certainly benefit from a deeper understanding of the [Airbyte Specification](https://docs.airbyte.io/architecture/airbyte-specification) before proceeding, but we do a quick overview of it in our basic concepts guide below. +This concepts section serves as a general introduction to the Python CDK. Readers will certainly benefit from a deeper understanding of the [Airbyte Specification](https://docs.airbyte.io/architecture/airbyte-protocol) before proceeding, but we do a quick overview of it in our basic concepts guide below. ### Basic Concepts If you want to learn more about the classes required to implement an Airbyte Source, head to our [basic concepts doc](basic-concepts.md). diff --git a/airbyte-cdk/python/docs/concepts/incremental-stream.md b/airbyte-cdk/python/docs/concepts/incremental-stream.md index 37009e3f8fd8a..b9db5260e0c9c 100644 --- a/airbyte-cdk/python/docs/concepts/incremental-stream.md +++ b/airbyte-cdk/python/docs/concepts/incremental-stream.md @@ -1,47 +1,103 @@ -# The Incremental Stream +# Incremental Streams -An incremental Stream is a stream which reads data incrementally. That is, it only reads data that was generated or updated since the last time it ran, and is thus far more efficient than a stream which reads all the source data every time it runs. If possible, developers are encouraged to implement incremental streams to reduce sync times and resource usage. - -Several new pieces are essential to understand how incrementality works with the CDK: +An incremental Stream is a stream which reads data incrementally. That is, it only reads data that was generated or updated since the last time it ran, and is thus far more efficient than a stream which reads all the source data every time it runs. If possible, developers are encouraged to implement incremental streams to reduce sync times and resource usage. 
+
+Several new pieces are essential to understand how incrementality works with the CDK:

* `AirbyteStateMessage`
* cursor fields
-`Stream.get_updated_state`
-as well as a few other optional concepts.
+* `IncrementalMixin`
+* `Stream.get_updated_state` (deprecated)
+
+  as well as a few other optional concepts.

### `AirbyteStateMessage`

-The `AirbyteStateMessage`
-persists state between syncs, and allows a new sync to pick up from where the previous sync last finished. See the [incremental sync guide](https://docs.airbyte.io/understanding-airbyte/connections/incremental-append) for more information.
+The `AirbyteStateMessage` persists state between syncs, and allows a new sync to pick up from where the previous sync last finished. See the [incremental sync guide](https://docs.airbyte.io/understanding-airbyte/connections/incremental-append) for more information.

### Cursor fields
+
 The `cursor_field` refers to the field in the stream's output records used to determine the "recency" or ordering of records. An example is a `created_at` or `updated_at` field in an API or DB table.

-Cursor fields can be input by the user (e.g: a user can choose to use an auto-incrementing `id` column in a DB table) or they can be defined by the source e.g: where an API defines that `updated_at` is what determines the ordering of records.
+Cursor fields can be input by the user \(e.g: a user can choose to use an auto-incrementing `id` column in a DB table\) or they can be defined by the source, e.g: where an API defines that `updated_at` is what determines the ordering of records.
+
+In the context of the CDK, setting the `Stream.cursor_field` property to any truthy value informs the framework that this stream is incremental.
+
+### `IncrementalMixin`
+
+This mixin class adds a `state` property with an abstract getter and setter.
+The `state` attribute helps the CDK determine the current state of the sync at any moment (in contrast to the deprecated `Stream.get_updated_state` method).
+The setter typically deserializes the state saved by the CDK and initializes the stream's internal state.
+The getter should serialize the stream's internal state.
-In the context of the CDK, setting the `Stream.cursor_field` property to any value informs the framework that this stream is incremental.
+
+```python
+@property
+def state(self) -> Mapping[str, Any]:
+    return {self.cursor_field: str(self._cursor_value)}
+
+@state.setter
+def state(self, value: Mapping[str, Any]):
+    self._cursor_value = value[self.cursor_field]
+```
+
+The actual logic that updates the state during reading is implemented elsewhere, usually as part of the `read_records` method, right after the latest record that matches the new state has been returned.
+Therefore, the state represents the latest checkpoint successfully achieved, and all subsequent records belong to the next checkpoint after that one.
+```python
+def read_records(self, ...):
+    ...
+    yield record
+    yield record
+    yield record
+    self._cursor_value = max(record[self.cursor_field], self._cursor_value)
+    yield record
+    yield record
+    yield record
+    self._cursor_value = max(record[self.cursor_field], self._cursor_value)
+```

### `Stream.get_updated_state`
-This function helps the CDK figure out the latest state for every record output by the connector
-(as returned by the `Stream.read_records` method). This allows sync to resume from where the previous sync last stopped,
-regardless of success or failure. This function typically compares the state object's and the latest record's cursor field, picking the latest one.
+(deprecated since 1.48.0, see `IncrementalMixin`)
+
+This function helps the stream keep track of the latest state by inspecting every record output by the stream \(as returned by the `Stream.read_records` method\) and comparing it against the most recent state object. This allows sync to resume from where the previous sync last stopped, regardless of success or failure. This function typically compares the state object's and the latest record's cursor field, picking the latest one.
+
+## Checkpointing state
+
+There are two ways of checkpointing state \(i.e: controlling the timing of when state is saved\) while reading data from a connector:
+
+1. Interval-based checkpointing
+2. Stream Slices
+
+### Interval-based checkpointing
+
+This is the simplest method for checkpointing. When the interval is set to a truthy value, e.g: 100, state is persisted after every 100 records output by the connector, i.e: state is saved after reading 100 records, then 200, 300, and so on.
+
+While this is very simple, **it requires that records are output in ascending order with regard to the cursor field**. For example, if your stream outputs records in ascending order of the `updated_at` field, then this is a good fit for your use case. But if the stream outputs records in a random order, then you cannot use this method, because we can only be certain that every record up to a particular `updated_at` timestamp has been read once all records have been fully read.
+
+Interval-based checkpointing can be implemented by setting the `Stream.state_checkpoint_interval` property, e.g:
+
+```python
+class MyAmazingStream(Stream):
+    # Save the state every 100 records
+    state_checkpoint_interval = 100
+```
 ### `Stream.stream_slices`
-The above methods can optionally be paired with the `stream_slices` function to granularly control exactly when state is saved. Conceptually, a Stream Slice is a subset of the records in a stream which represent the smallest unit of data which can be re-synced. Once a full slice is read, an `AirbyteStateMessage` will be output, causing state to be saved. If a connector fails while reading the Nth slice of a stream, then the next time it retries, it will begin reading at the beginning of the Nth slice again, rather than re-read slices `1...N-1`.
-A Slice object is not typed, and the developer is free to include any information necessary to make the request. This function is called when the `Stream` is about to be read. Typically, the `stream_slices` function, via inspecting the state object,
-generates a Slice for every request to be made.
+Stream slices can be used to achieve finer-grained control over when state is checkpointed.
-As an example, suppose an API is able to dispense data hourly. If the last sync was exactly 24 hours ago,
-we can either make an API call retrieving all data at once, or make 24 calls each retrieving an hour's
-worth of data. In the latter case, the `stream_slices` function, sees that the previous state contains
-yesterday's timestamp, and returns a list of 24 Slices, each with a different hourly timestamp to be
-used when creating request. If the stream fails halfway through (at the 12th slice), then the next time it starts reading, it will read from the beginning of the 12th slice.
+Conceptually, a Stream Slice is a subset of the records in a stream which represents the smallest unit of data which can be re-synced. Once a full slice is read, an `AirbyteStateMessage` will be output, causing state to be saved.
If a connector fails while reading the Nth slice of a stream, then the next time it retries, it will begin reading at the beginning of the Nth slice again, rather than re-read slices `1...N-1`. -For a more in-depth description of stream slicing, see the [Stream Slices guide](./stream_slices.md). +A Slice object is not typed, and the developer is free to include any information necessary to make the request. This function is called when the `Stream` is about to be read. Typically, the `stream_slices` function, via inspecting the state object, generates a Slice for every request to be made. + +As an example, suppose an API is able to dispense data hourly. If the last sync was exactly 24 hours ago, we can either make an API call retrieving all data at once, or make 24 calls each retrieving an hour's worth of data. In the latter case, the `stream_slices` function, sees that the previous state contains yesterday's timestamp, and returns a list of 24 Slices, each with a different hourly timestamp to be used when creating request. If the stream fails halfway through \(at the 12th slice\), then the next time it starts reading, it will read from the beginning of the 12th slice. + +For a more in-depth description of stream slicing, see the [Stream Slices guide](https://github.com/airbytehq/airbyte/tree/8500fef4133d3d06e16e8b600d65ebf2c58afefd/docs/connector-development/cdk-python/stream-slices.md). + +## Conclusion -## Conclusion In summary, an incremental stream requires: + * the `cursor_field` property -* the `get_updated_state` function +* to be inherited from `IncrementalMixin` and state methods implemented * Optionally, the `stream_slices` function + diff --git a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-any-percent/cdk-speedrun.md b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-any-percent/cdk-speedrun.md index ac1bcfdce43ee..38bd872b767f7 100644 --- a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-any-percent/cdk-speedrun.md +++ b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-any-percent/cdk-speedrun.md @@ -4,7 +4,7 @@ This is a blazing fast guide to building an HTTP source connector. Think of it a # Dependencies -1. Python >= 3.7 +1. Python >= 3.9 2. Docker 3. NodeJS @@ -42,7 +42,6 @@ We're working with the Exchange Rates API, so we need to define our input schema "title": "Python Http Tutorial Spec", "type": "object", "required": ["start_date", "currency_base"], - "additionalProperties": false, "properties": { "start_date": { "type": "string", diff --git a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/0-getting-started.md b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/0-getting-started.md index cbee09971d102..a1bc7b2227253 100644 --- a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/0-getting-started.md +++ b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/0-getting-started.md @@ -6,11 +6,11 @@ This is a step-by-step guide for how to create an Airbyte source in Python to re ## Requirements -* Python >= 3.7 +* Python >= 3.9 * Docker * NodeJS \(only used to generate the connector\). We'll remove the NodeJS dependency soon. -All the commands below assume that `python` points to a version of python >=3.7.0. On some systems, `python` points to a Python2 installation and `python3` points to Python3. If this is the case on your machine, substitute all `python` commands in this guide with `python3`. +All the commands below assume that `python` points to a version of python >=3.9.0. 
On some systems, `python` points to a Python2 installation and `python3` points to Python3. If this is the case on your machine, substitute all `python` commands in this guide with `python3`. ## Checklist diff --git a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/2-install-dependencies.md b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/2-install-dependencies.md index 468713097ad4f..ba26e7799f485 100644 --- a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/2-install-dependencies.md +++ b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/2-install-dependencies.md @@ -20,7 +20,7 @@ python main.py spec You should see some output: ```text -{"type": "SPEC", "spec": {"documentationUrl": "https://docsurl.com", "connectionSpecification": {"$schema": "http://json-schema.org/draft-07/schema#", "title": "Python Http Tutorial Spec", "type": "object", "required": ["TODO"], "additionalProperties": false, "properties": {"TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.": {"type": "string", "description": "describe me"}}}}} +{"type": "SPEC", "spec": {"documentationUrl": "https://docsurl.com", "connectionSpecification": {"$schema": "http://json-schema.org/draft-07/schema#", "title": "Python Http Tutorial Spec", "type": "object", "required": ["TODO"], "properties": {"TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.": {"type": "string", "description": "describe me"}}}}} ``` We just ran Airbyte Protocol's `spec` command! We'll talk more about this later, but this is a simple sanity check to make sure everything is wired up correctly. diff --git a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/3-define-inputs.md b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/3-define-inputs.md index 5f5ce8b76e11a..e5c3465deb655 100644 --- a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/3-define-inputs.md +++ b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/3-define-inputs.md @@ -4,7 +4,7 @@ Each connector declares the inputs it needs to read data from the underlying dat The simplest way to implement this is by creating a `.json` file in `source_/spec.json` which describes your connector's inputs according to the [ConnectorSpecification](https://github.com/airbytehq/airbyte/blob/master/airbyte-protocol/models/src/main/resources/airbyte_protocol/airbyte_protocol.yaml#L211) schema. This is a good place to start when developing your source. Using JsonSchema, define what the inputs are \(e.g. username and password\). Here's [an example](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-freshdesk/source_freshdesk/spec.json) of what the `spec.json` looks like for the Freshdesk API source. -For more details on what the spec is, you can read about the Airbyte Protocol [here](https://docs.airbyte.io/understanding-airbyte/airbyte-specification). +For more details on what the spec is, you can read about the Airbyte Protocol [here](https://docs.airbyte.io/understanding-airbyte/airbyte-protocol). The generated code that Airbyte provides, handles implementing the `spec` method for you. It assumes that there will be a file called `spec.json` in the same directory as `source.py`. If you have declared the necessary JsonSchema in `spec.json` you should be done with this step. 
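
The value of declaring inputs this way is that any user-supplied config can be checked against the schema before the connector does real work. The snippet below only illustrates that idea with the `jsonschema` package and a hypothetical two-field spec; it is not how the CDK wires up validation internally.

```python
import json

from jsonschema import ValidationError, validate

# A hypothetical connectionSpecification, mirroring the shape of a spec.json file.
spec = {
    "type": "object",
    "required": ["start_date", "currency_base"],
    "properties": {
        "start_date": {"type": "string"},
        "currency_base": {"type": "string"},
    },
}

config = json.loads('{"start_date": "2021-04-01"}')  # user config missing currency_base

try:
    validate(instance=config, schema=spec)
except ValidationError as err:
    print(f"Invalid config: {err.message}")
```
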
@@ -18,7 +18,6 @@ Given that we'll pulling currency data for our example source, we'll define the "title": "Python Http Tutorial Spec", "type": "object", "required": ["start_date", "currency_base"], - "additionalProperties": false, "properties": { "start_date": { "type": "string", diff --git a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/6-read-data.md b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/6-read-data.md index 9bf038268a3ed..5a085d9b8db51 100644 --- a/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/6-read-data.md +++ b/airbyte-cdk/python/docs/tutorials/cdk-tutorial-python-http/6-read-data.md @@ -24,9 +24,9 @@ Optionally, we can provide additional inputs to customize requests: Backoff policy options: -- `retry_factor` Specifies factor for exponential backoff policy (by default is 5) -- `max_retries` Specifies maximum amount of retries for backoff policy (by default is 5) -- `raise_on_http_errors` If set to False, allows opting-out of raising HTTP code exception (by default is True) +* `retry_factor` Specifies factor for exponential backoff policy \(by default is 5\) +* `max_retries` Specifies maximum amount of retries for backoff policy \(by default is 5\) +* `raise_on_http_errors` If set to False, allows opting-out of raising HTTP code exception \(by default is True\) There are many other customizable options - you can find them in the [`airbyte_cdk.sources.streams.http.HttpStream`](https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py) class. @@ -37,9 +37,9 @@ Let's begin by pulling data for the last day's rates by using the `/latest` endp ```python class ExchangeRates(HttpStream): url_base = "https://api.exchangeratesapi.io/" - + primary_key = None - + def __init__(self, base: str, **kwargs): super().__init__() self.base = base @@ -85,7 +85,7 @@ This may look big, but that's just because there are lots of \(unused, for now\) Let's also pass the `base` parameter input by the user to the stream class: ```python -def streams(self, config: Mapping[str, Any]) -> List[Stream]: + def streams(self, config: Mapping[str, Any]) -> List[Stream]: auth = NoAuth() return [ExchangeRates(authenticator=auth, base=config['base'])] ``` @@ -110,7 +110,14 @@ We theoretically _could_ stop here and call it a connector. But let's give addin ## Adding incremental sync -To add incremental sync, we'll do a few things: 1. Pass the `start_date` param input by the user into the stream. 2. Declare the stream's `cursor_field`. 3. Implement the `get_updated_state` method. 4. Implement the `stream_slices` method. 5. Update the `path` method to specify the date to pull exchange rates for. 6. Update the configured catalog to use `incremental` sync when we're testing the stream. +To add incremental sync, we'll do a few things: +1. Pass the `start_date` param input by the user into the stream. +2. Declare the stream's `cursor_field`. +3. Declare the stream's property `_cursor_value` to hold the state value +4. Add `IncrementalMixin` to the list of the ancestors of the stream and implement setter and getter of the `state`. +5. Implement the `stream_slices` method. +6. Update the `path` method to specify the date to pull exchange rates for. +7. Update the configured catalog to use `incremental` sync when we're testing the stream. We'll describe what each of these methods do below. 
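
As a preview of step 1 above, the source's `streams()` method is where the user-supplied `start_date` gets handed to the stream. The sketch below extends the `streams()` snippet shown earlier; the `'%Y-%m-%d'` format and the source class name are assumptions, and the constructor change it relies on is introduced just below.

```python
from datetime import datetime

class SourcePythonHttpTutorial(AbstractSource):  # placeholder name for the generated source class
    ...

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        auth = NoAuth()
        # Parse the user-supplied start_date; the '%Y-%m-%d' format is an assumption.
        start_date = datetime.strptime(config["start_date"], "%Y-%m-%d")
        return [ExchangeRates(authenticator=auth, base=config["base"], start_date=start_date)]
```
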
+Before we begin, it may help to familiarize yourself with how incremental sync works in Airbyte by reading the [docs on incremental](https://docs.airbyte.io/architecture/connections/incremental-append).
@@ -132,7 +139,7 @@ Let's also add this parameter to the constructor and declare the `cursor_field`:
 from datetime import datetime, timedelta
-class ExchangeRates(HttpStream):
+class ExchangeRates(HttpStream, IncrementalMixin):
     url_base = "https://api.exchangeratesapi.io/"
     cursor_field = "date"
     primary_key = "date"
@@ -141,24 +148,38 @@ class ExchangeRates(HttpStream):
         super().__init__()
         self.base = base
         self.start_date = start_date
+        self._cursor_value = None
```
 Declaring the `cursor_field` informs the framework that this stream now supports incremental sync. The next time you run `python main_dev.py discover --config sample_files/config.json` you'll find that the `supported_sync_modes` field now also contains `incremental`.
 But we're not quite done with supporting incremental, we have to actually emit state! We'll structure our state object very simply: it will be a `dict` whose single key is `'date'` and value is the date of the last day we synced data from. For example, `{'date': '2021-04-26'}` indicates the connector previously read data up until April 26th and therefore shouldn't re-read anything before April 26th.
-Let's do this by implementing the `get_updated_state` method inside the `ExchangeRates` class.
+Let's do this by implementing the getter and setter for the `state` inside the `ExchangeRates` class.
 ```python
-    def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, any]:
-        # This method is called once for each record returned from the API to compare the cursor field value in that record with the current state
-        # we then return an updated state object. If this is the first time we run a sync or no state was passed, current_stream_state will be None.
-        if current_stream_state is not None and 'date' in current_stream_state:
-            current_parsed_date = datetime.strptime(current_stream_state['date'], '%Y-%m-%d')
-            latest_record_date = datetime.strptime(latest_record['date'], '%Y-%m-%d')
-            return {'date': max(current_parsed_date, latest_record_date).strftime('%Y-%m-%d')}
+    @property
+    def state(self) -> Mapping[str, Any]:
+        if self._cursor_value:
+            return {self.cursor_field: self._cursor_value.strftime('%Y-%m-%d')}
         else:
-            return {'date': self.start_date.strftime('%Y-%m-%d')}
+            return {self.cursor_field: self.start_date.strftime('%Y-%m-%d')}
+
+    @state.setter
+    def state(self, value: Mapping[str, Any]):
+        self._cursor_value = datetime.strptime(value[self.cursor_field], '%Y-%m-%d')
+```
+
+Update the internal `_cursor_value` inside the `read_records` method:
+
+```python
+    def read_records(self, *args, **kwargs) -> Iterable[Mapping[str, Any]]:
+        for record in super().read_records(*args, **kwargs):
+            if self._cursor_value:
+                latest_record_date = datetime.strptime(record[self.cursor_field], '%Y-%m-%d')
+                self._cursor_value = max(self._cursor_value, latest_record_date)
+            yield record
+
+```
 This implementation compares the date from the latest record with the date in the current state and takes the maximum as the "new" state object.
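
To see how the getter and setter round-trip outside of a real sync, here is a short illustrative snippet. It is not part of the tutorial code; it assumes the `ExchangeRates` class exactly as defined above.

```python
from datetime import datetime

stream = ExchangeRates(base="EUR", start_date=datetime(2021, 4, 20))

# State handed back from a previous sync is applied through the setter...
stream.state = {"date": "2021-04-26"}

# ...and the getter serializes whatever the stream currently considers its cursor.
print(stream.state)  # {'date': '2021-04-26'}
```
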
@@ -166,20 +187,19 @@ This implementation compares the date from the latest record with the date in th We'll implement the `stream_slices` method to return a list of the dates for which we should pull data based on the stream state if it exists: ```python - def _chunk_date_range(self, start_date: datetime) -> List[Mapping[str, any]]: + def _chunk_date_range(self, start_date: datetime) -> List[Mapping[str, Any]]: """ Returns a list of each day between the start date and now. The return value is a list of dicts {'date': date_string}. """ dates = [] while start_date < datetime.now(): - dates.append({'date': start_date.strftime('%Y-%m-%d')}) + dates.append({self.cursor_field: start_date.strftime('%Y-%m-%d')}) start_date += timedelta(days=1) return dates - def stream_slices(self, sync_mode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None) -> Iterable[ - Optional[Mapping[str, any]]]: - start_date = datetime.strptime(stream_state['date'], '%Y-%m-%d') if stream_state and 'date' in stream_state else self.start_date + def stream_slices(self, sync_mode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None) -> Iterable[Optional[Mapping[str, Any]]]: + start_date = datetime.strptime(stream_state[self.cursor_field], '%Y-%m-%d') if stream_state and self.cursor_field in stream_state else self.start_date return self._chunk_date_range(start_date) ``` @@ -222,3 +242,4 @@ You should see that only the record from the last date is being synced! This is With that, we've implemented incremental sync for our connector! +```` diff --git a/airbyte-cdk/python/docs/tutorials/http_api_source.md b/airbyte-cdk/python/docs/tutorials/http_api_source.md index 400de7fad0e32..386c3fd8edc79 100644 --- a/airbyte-cdk/python/docs/tutorials/http_api_source.md +++ b/airbyte-cdk/python/docs/tutorials/http_api_source.md @@ -7,11 +7,11 @@ Exchangerates API as an example since it is both simple but demonstrates a lot o ## Requirements -* Python >= 3.7 +* Python >= 3.9 * Docker * NodeJS (only used to generate the connector). We'll remove the NodeJS dependency soon. -All the commands below assume that `python` points to a version of python >=3.7.0. On some systems, `python` points to a Python2 installation and `python3` points to Python3. If this is the case on your machine, substitute all `python` commands in this guide with `python3`. +All the commands below assume that `python` points to a version of python >=3.9. On some systems, `python` points to a Python2 installation and `python3` points to Python3. If this is the case on your machine, substitute all `python` commands in this guide with `python3`. ## Checklist * Step 1: Create the source using the template @@ -60,7 +60,7 @@ python main_dev.py spec You should see some output: ``` -{"type": "SPEC", "spec": {"documentationUrl": "https://docsurl.com", "connectionSpecification": {"$schema": "http://json-schema.org/draft-07/schema#", "title": "Python Http Tutorial Spec", "type": "object", "required": ["TODO"], "additionalProperties": false, "properties": {"TODO: This schema defines the configuration required for the source. 
This usually involves metadata such as database and/or authentication information.": {"type": "string", "description": "describe me"}}}}} +{"type": "SPEC", "spec": {"documentationUrl": "https://docsurl.com", "connectionSpecification": {"$schema": "http://json-schema.org/draft-07/schema#", "title": "Python Http Tutorial Spec", "type": "object", "required": ["TODO"], "properties": {"TODO: This schema defines the configuration required for the source. This usually involves metadata such as database and/or authentication information.": {"type": "string", "description": "describe me"}}}}} ``` We just ran Airbyte Protocol's `spec` command! We'll talk more about this later, but this is a simple sanity check to make sure everything is wired up correctly. @@ -119,7 +119,7 @@ Each connector declares the inputs it needs to read data from the underlying dat The simplest way to implement this is by creating a `.json` file in `source_/spec.json` which describes your connector's inputs according to the [ConnectorSpecification](https://github.com/airbytehq/airbyte/blob/master/airbyte-protocol/models/src/main/resources/airbyte_protocol/airbyte_protocol.yaml#L211) schema. This is a good place to start when developing your source. Using JsonSchema, define what the inputs are \(e.g. username and password\). Here's [an example](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-freshdesk/source_freshdesk/spec.json) of what the `spec.json` looks like for the Freshdesk API source. -For more details on what the spec is, you can read about the Airbyte Protocol [here](https://docs.airbyte.io/understanding-airbyte/airbyte-specification#the-airbyte-protocol). +For more details on what the spec is, you can read about the Airbyte Protocol [here](https://docs.airbyte.io/understanding-airbyte/airbyte-protocol#the-airbyte-protocol). The generated code that Airbyte provides, handles implementing the `spec` method for you. It assumes that there will be a file called `spec.json` in the same directory as `source.py`. If you have declared the necessary JsonSchema in `spec.json` you should be done with this step. @@ -133,7 +133,6 @@ Given that we'll pulling currency data for our example source, we'll define the "title": "Python Http Tutorial Spec", "type": "object", "required": ["start_date", "currency_base"], - "additionalProperties": false, "properties": { "start_date": { "type": "string", @@ -312,7 +311,8 @@ Backoff policy options: - `max_retries` Specifies maximum amount of retries for backoff policy (by default is 5) - `raise_on_http_errors` If set to False, allows opting-out of raising HTTP code exception (by default is True) -There are many other customizable options - you can find them in the [`base_python.cdk.streams.http.HttpStream`](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/bases/base-python/base_python/cdk/streams/http.py) class. + +There are many other customizable options - you can find them in the [`airbyte_cdk.sources.streams.http.HttpStream`](https://github.com/airbytehq/airbyte/blob/master/airbyte-cdk/python/airbyte_cdk/sources/streams/http/http.py) class. So in order to read data from the exchange rates API, we'll fill out the necessary information for the stream to do its work. First, we'll implement a basic read that just reads the last day's exchange rates, then we'll implement incremental sync using stream slicing. @@ -396,13 +396,15 @@ There we have it - a stream which reads data in just a few lines of code! 
We theoretically _could_ stop here and call it a connector. But let's give adding incremental sync a shot. #### Adding incremental sync + To add incremental sync, we'll do a few things: -1. Pass the `start_date` param input by the user into the stream. -2. Declare the stream's `cursor_field`. -3. Implement the `get_updated_state` method. -4. Implement the `stream_slices` method. -5. Update the `path` method to specify the date to pull exchange rates for. -6. Update the configured catalog to use `incremental` sync when we're testing the stream. +1. Pass the `start_date` param input by the user into the stream. +2. Declare the stream's `cursor_field`. +3. Declare the stream's property `_cursor_value` to hold the state value +4. Add `IncrementalMixin` to the list of the ancestors of the stream and implement setter and getter of the `state`. +5. Implement the `stream_slices` method. +6. Update the `path` method to specify the date to pull exchange rates for. +7. Update the configured catalog to use `incremental` sync when we're testing the stream. We'll describe what each of these methods do below. Before we begin, it may help to familiarize yourself with how incremental sync works in Airbyte by reading the [docs on incremental](https://docs.airbyte.io/architecture/connections/incremental-append). @@ -424,7 +426,7 @@ Let's also add this parameter to the constructor and declare the `cursor_field`: from datetime import datetime, timedelta -class ExchangeRates(HttpStream): +class ExchangeRates(HttpStream, IncrementalMixin): url_base = "https://api.exchangeratesapi.io/" cursor_field = "date" @@ -432,25 +434,39 @@ class ExchangeRates(HttpStream): super().__init__() self.base = base self.start_date = start_date + self._cursor_value = None ``` Declaring the `cursor_field` informs the framework that this stream now supports incremental sync. The next time you run `python main_dev.py discover --config sample_files/config.json` you'll find that the `supported_sync_modes` field now also contains `incremental`. But we're not quite done with supporting incremental, we have to actually emit state! We'll structure our state object very simply: it will be a `dict` whose single key is `'date'` and value is the date of the last day we synced data from. For example, `{'date': '2021-04-26'}` indicates the connector previously read data up until April 26th and therefore shouldn't re-read anything before April 26th. -Let's do this by implementing the `get_updated_state` method inside the `ExchangeRates` class. +Let's do this by implementing the getter and setter for the `state` inside the `ExchangeRates` class. ```python - def get_updated_state(self, current_stream_state: MutableMapping[str, Any], latest_record: Mapping[str, Any]) -> Mapping[str, any]: - # This method is called once for each record returned from the API to compare the cursor field value in that record with the current state - # we then return an updated state object. If this is the first time we run a sync or no state was passed, current_stream_state will be None. 
- if current_stream_state is not None and 'date' in current_stream_state: - current_parsed_date = datetime.strptime(current_stream_state['date'], '%Y-%m-%d') - latest_record_date = datetime.strptime(latest_record['date'], '%Y-%m-%d') - return {'date': max(current_parsed_date, latest_record_date).strftime('%Y-%m-%d')} + @property + def state(self) -> Mapping[str, Any]: + if self._cursor_value: + return {self.cursor_field: self._cursor_value.strftime('%Y-%m-%d')} else: - return {'date': self.start_date.strftime('%Y-%m-%d')} -``` + return {self.cursor_field: self.start_date.strftime('%Y-%m-%d')} + + @state.setter + def state(self, value: Mapping[str, Any]): + self._cursor_value = datetime.strptime(value[self.cursor_field], '%Y-%m-%d') +``` + +Update internal state `cursor_value` inside `read_records` method + +```python + def read_records(self, *args, **kwargs) -> Iterable[Mapping[str, Any]]: + for record in super().read_records(*args, **kwargs): + if self._cursor_value: + latest_record_date = datetime.strptime(latest_record[self.cursor_field], '%Y-%m-%d') + self._cursor_value = max(self._cursor_value, latest_record_date) + yield record + +``` This implementation compares the date from the latest record with the date in the current state and takes the maximum as the "new" state object. diff --git a/airbyte-cdk/python/docs/tutorials/http_api_source_assets/configured_catalog.json b/airbyte-cdk/python/docs/tutorials/http_api_source_assets/configured_catalog.json index 66ab9be9e7bb5..7aa9a7e9b2229 100644 --- a/airbyte-cdk/python/docs/tutorials/http_api_source_assets/configured_catalog.json +++ b/airbyte-cdk/python/docs/tutorials/http_api_source_assets/configured_catalog.json @@ -7,6 +7,9 @@ "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { + "access_key": { + "type": "string" + }, "base": { "type": "string" }, diff --git a/airbyte-cdk/python/docs/tutorials/http_api_source_assets/exchange_rates.json b/airbyte-cdk/python/docs/tutorials/http_api_source_assets/exchange_rates.json index 7476b088094e2..9462ce0079e6e 100644 --- a/airbyte-cdk/python/docs/tutorials/http_api_source_assets/exchange_rates.json +++ b/airbyte-cdk/python/docs/tutorials/http_api_source_assets/exchange_rates.json @@ -2,6 +2,9 @@ "type": "object", "required": ["base", "date", "rates"], "properties": { + "access_key": { + "type": "string" + }, "base": { "type": "string" }, diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst index 7cc0db2ed60a9..4d270b65e6ee9 100644 --- a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.rst @@ -8,6 +8,7 @@ Subpackages airbyte_cdk.destinations airbyte_cdk.models airbyte_cdk.sources + airbyte_cdk.utils Submodules ---------- @@ -28,6 +29,14 @@ airbyte\_cdk.entrypoint module :undoc-members: :show-inheritance: +airbyte\_cdk.exception\_handler module +-------------------------------------- + +.. 
automodule:: airbyte_cdk.exception_handler + :members: + :undoc-members: + :show-inheritance: + airbyte\_cdk.logger module -------------------------- diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst new file mode 100644 index 0000000000000..43c8a518e5d79 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.auth.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.auth.oauth module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.auth.oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.auth.token module +-------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.auth.token + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.auth + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst new file mode 100644 index 0000000000000..d4d275419f546 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.checks.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.checks.check\_stream module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.checks.check_stream + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.checks.connection\_checker module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.checks.connection_checker + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.checks + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst new file mode 100644 index 0000000000000..7cd9ebae47ca7 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.datetime.rst @@ -0,0 +1,19 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.datetime.min\_max\_datetime module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.datetime.min_max_datetime + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.datetime + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst new file mode 100644 index 0000000000000..3d4a362b1064a --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.decoders.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.decoders.decoder module +-------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.decoders.decoder + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.decoders.json\_decoder module +-------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.decoders.json_decoder + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.decoders + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst new file mode 100644 index 0000000000000..507b25296fe4a --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.extractors.rst @@ -0,0 +1,43 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.extractors.http\_selector module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.http_selector + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.jello module +-------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.jello + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.record\_filter module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.record_filter + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.extractors.record\_selector module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors.record_selector + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.extractors + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst new file mode 100644 index 0000000000000..22ab8838517b5 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.interpolation.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_boolean module +--------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_boolean + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_mapping module +--------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_mapping + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolated\_string module +-------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.interpolated_string + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.interpolation module +------------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.interpolation.interpolation + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.jinja module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.jinja + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.interpolation.macros module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.interpolation.macros + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.interpolation + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst new file mode 100644 index 0000000000000..c5f9fdb8b8ecd --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.parsers.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.parsers.class\_types\_registry module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.class_types_registry + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.config\_parser module +-------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.config_parser + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.default\_implementation\_registry module +--------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.default_implementation_registry + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.factory module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.factory + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.undefined\_reference\_exception module +------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.parsers.undefined_reference_exception + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.parsers.yaml\_parser module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.parsers.yaml_parser + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst new file mode 100644 index 0000000000000..0fa4a8c4070d2 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.rst @@ -0,0 +1,51 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.constant\_backoff\_strategy module +------------------------------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.exponential\_backoff\_strategy module +--------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.header\_helper module +----------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.wait\_time\_from\_header\_backoff\_strategy module +---------------------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategies.wait\_until\_time\_from\_header\_backoff\_strategy module +----------------------------------------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst new file mode 100644 index 0000000000000..5b69c8b19ce79 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.error_handlers.rst @@ -0,0 +1,75 @@ + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.backoff\_strategy module +------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.composite\_error\_handler module +-------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.default\_error\_handler module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.error\_handler module +--------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.error_handler + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.http\_response\_filter module +----------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.response\_action module +----------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.response_action + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.error\_handlers.response\_status module +----------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers.response_status + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.error_handlers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst new file mode 100644 index 0000000000000..983bada8ee010 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.rst @@ -0,0 +1,51 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.paginators.strategies + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.paginators.limit\_paginator module +------------------------------------------------------------------------------ + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.no\_pagination module +---------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.no_pagination + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.pagination\_strategy module +---------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.pagination_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.paginator module +----------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.paginator + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst new file mode 100644 index 0000000000000..98c5c02182250 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.paginators.strategies.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.cursor\_pagination\_strategy module +----------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.offset\_increment module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.paginators.strategies.page\_increment module +---------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.paginators.strategies + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst new file mode 100644 index 0000000000000..ebb492b2a50b7 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_headers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.request\_headers.interpolated\_request\_header\_provider module +----------------------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers.interpolated_request_header_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_headers.request\_header\_provider module +--------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers.request_header_provider + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_headers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst new file mode 100644 index 0000000000000..454e6c2af4bb4 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.request_options.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.request\_options.interpolated\_request\_options\_provider module +------------------------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_options.request\_options\_provider module +---------------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options.request_options_provider + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_options + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst new file mode 100644 index 0000000000000..65e58aec2e736 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.retriers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.retriers.default\_retrier module +---------------------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.requesters.retriers.default_retrier + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.retriers.retrier module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.retriers.retrier + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.retriers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst new file mode 100644 index 0000000000000..7ed1b26911309 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.requesters.rst @@ -0,0 +1,53 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.requesters.error_handlers + airbyte_cdk.sources.declarative.requesters.paginators + airbyte_cdk.sources.declarative.requesters.request_options + +Submodules +---------- + +airbyte\_cdk.sources.declarative.requesters.http\_requester module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.http_requester + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.interpolated\_request\_input\_provider module +----------------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters.interpolated_request_input_provider + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.request\_option module +------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.request_option + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.requesters.requester module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.requesters.requester + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.requesters + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst new file mode 100644 index 0000000000000..763c663648a95 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.retrievers.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.retrievers.retriever module +------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.retrievers.retriever + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.retrievers.simple\_retriever module +-------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.retrievers.simple_retriever + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.retrievers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst new file mode 100644 index 0000000000000..97b718996eba8 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.rst @@ -0,0 +1,78 @@ + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + airbyte_cdk.sources.declarative.auth + airbyte_cdk.sources.declarative.checks + airbyte_cdk.sources.declarative.datetime + airbyte_cdk.sources.declarative.decoders + airbyte_cdk.sources.declarative.extractors + airbyte_cdk.sources.declarative.interpolation + airbyte_cdk.sources.declarative.parsers + airbyte_cdk.sources.declarative.requesters + airbyte_cdk.sources.declarative.retrievers + airbyte_cdk.sources.declarative.schema + airbyte_cdk.sources.declarative.stream_slicers + airbyte_cdk.sources.declarative.transformations + +Submodules +---------- + +airbyte\_cdk.sources.declarative.create\_partial module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.create_partial + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.declarative\_source module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.declarative_source + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.declarative\_stream module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.declarative_stream + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.read\_exception module +------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.read_exception + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.types module +--------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.types + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.yaml\_declarative\_source module +----------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.yaml_declarative_source + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst new file mode 100644 index 0000000000000..d0da7b6a71279 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.schema.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.schema.json\_schema module +----------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.schema.json_schema + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.schema.schema\_loader module +------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.schema.schema_loader + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.schema + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst new file mode 100644 index 0000000000000..f4d331f8d3dab --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.states.rst @@ -0,0 +1,27 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.states.dict\_state module +---------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.states.dict_state + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.states.state module +---------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.states.state + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.states + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst new file mode 100644 index 0000000000000..ed5f6c179977a --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.stream_slicers.rst @@ -0,0 +1,59 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.stream\_slicers.cartesian\_product\_stream\_slicer module +------------------------------------------------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.datetime\_stream\_slicer module +-------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.list\_stream\_slicer module +---------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.single\_slice module +--------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.single_slice + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.stream\_slicer module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.stream_slicer + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.stream\_slicers.substream\_slicer module +------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.stream_slicers.substream_slicer + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: airbyte_cdk.sources.declarative.stream_slicers + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst new file mode 100644 index 0000000000000..031b1af23d2c3 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.declarative.transformations.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.sources.declarative.transformations.add\_fields module +------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.add_fields + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.transformations.remove\_fields module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.remove_fields + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.declarative.transformations.transformation module +---------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations.transformation + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.sources.declarative.transformations + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst index 6ae9aa5b4e2fe..0a25c34ae005b 100644 --- a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.rst @@ -5,6 +5,7 @@ Subpackages .. toctree:: :maxdepth: 4 + airbyte_cdk.sources.declarative airbyte_cdk.sources.deprecated airbyte_cdk.sources.singer airbyte_cdk.sources.streams @@ -21,6 +22,14 @@ airbyte\_cdk.sources.abstract\_source module :undoc-members: :show-inheritance: +airbyte\_cdk.sources.config module +---------------------------------- + +.. automodule:: airbyte_cdk.sources.config + :members: + :undoc-members: + :show-inheritance: + airbyte\_cdk.sources.source module ---------------------------------- diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst index 74f3e41827e8b..b2a7bc7d8ec53 100644 --- a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.streams.http.requests_native_auth.rst @@ -2,6 +2,22 @@ Submodules ---------- +airbyte\_cdk.sources.streams.http.requests\_native\_auth.abstract\_oauth module +------------------------------------------------------------------------------- + +.. automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.abstract_oauth + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.sources.streams.http.requests\_native\_auth.abstract\_token module +------------------------------------------------------------------------------- + +.. 
automodule:: airbyte_cdk.sources.streams.http.requests_native_auth.abstract_token + :members: + :undoc-members: + :show-inheritance: + airbyte\_cdk.sources.streams.http.requests\_native\_auth.oauth module --------------------------------------------------------------------- diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst index a435a52f1664d..8b53a23a7199d 100644 --- a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.sources.utils.rst @@ -26,6 +26,14 @@ airbyte\_cdk.sources.utils.schema\_helpers module :undoc-members: :show-inheritance: +airbyte\_cdk.sources.utils.schema\_models module +------------------------------------------------ + +.. automodule:: airbyte_cdk.sources.utils.schema_models + :members: + :undoc-members: + :show-inheritance: + airbyte\_cdk.sources.utils.transform module ------------------------------------------- diff --git a/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst new file mode 100644 index 0000000000000..d65a8f8ec0553 --- /dev/null +++ b/airbyte-cdk/python/reference_docs/_source/api/airbyte_cdk.utils.rst @@ -0,0 +1,35 @@ + +Submodules +---------- + +airbyte\_cdk.utils.airbyte\_secrets\_utils module +------------------------------------------------- + +.. automodule:: airbyte_cdk.utils.airbyte_secrets_utils + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.utils.event\_timing module +--------------------------------------- + +.. automodule:: airbyte_cdk.utils.event_timing + :members: + :undoc-members: + :show-inheritance: + +airbyte\_cdk.utils.traced\_exception module +------------------------------------------- + +.. automodule:: airbyte_cdk.utils.traced_exception + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. automodule:: airbyte_cdk.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/airbyte-cdk/python/reference_docs/_source/conf.py b/airbyte-cdk/python/reference_docs/_source/conf.py index 6415fe5195d7f..5ce9636934f6c 100644 --- a/airbyte-cdk/python/reference_docs/_source/conf.py +++ b/airbyte-cdk/python/reference_docs/_source/conf.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # # Configuration file for the Sphinx documentation builder. @@ -32,7 +32,10 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = ["sphinx.ext.autodoc"] # API docs +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", # Support for NumPy and Google style docstrings +] # API docs source_suffix = {".rst": "restructuredtext", ".md": "markdown"} diff --git a/airbyte-cdk/python/reference_docs/_source/index.rst b/airbyte-cdk/python/reference_docs/_source/index.rst index eb0ede2d2ad77..cc6beef3ede95 100644 --- a/airbyte-cdk/python/reference_docs/_source/index.rst +++ b/airbyte-cdk/python/reference_docs/_source/index.rst @@ -21,6 +21,12 @@ This documentation is autogenerated from source code comments. More extensive ov api/airbyte_cdk.sources +.. 
toctree:: + :maxdepth: 4 + :caption: Utils + + api/airbyte_cdk.utils + Indices and tables ================== diff --git a/airbyte-cdk/python/reference_docs/generate_rst_schema.py b/airbyte-cdk/python/reference_docs/generate_rst_schema.py index 6a9edb3a72d56..37168aa0d8cd8 100755 --- a/airbyte-cdk/python/reference_docs/generate_rst_schema.py +++ b/airbyte-cdk/python/reference_docs/generate_rst_schema.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # import sys diff --git a/airbyte-cdk/python/setup.py b/airbyte-cdk/python/setup.py index baedbfb18d227..ff3a17f010518 100644 --- a/airbyte-cdk/python/setup.py +++ b/airbyte-cdk/python/setup.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -15,7 +15,7 @@ setup( name="airbyte-cdk", - version="0.1.47", + version="0.1.72", description="A framework for writing Airbyte Connectors.", long_description=README, long_description_content_type="text/markdown", @@ -33,8 +33,6 @@ "Topic :: Software Development :: Libraries :: Python Modules", "License :: OSI Approved :: MIT License", # Python Version Support - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", ], keywords="airbyte connector-development-kit cdk", @@ -46,26 +44,32 @@ packages=find_packages(exclude=("unit_tests",)), install_requires=[ "backoff", - "dpath==2.0.1", + "dataclasses-jsonschema~=2.15.1", + "dpath~=2.0.1", "jsonschema~=3.2.0", "jsonref~=0.2", "pendulum", "pydantic~=1.6", "PyYAML~=5.4", "requests", - "sentry-sdk~=1.5.1", "vcrpy", "Deprecated~=1.2", + "Jinja2~=3.1.2", + "jello~=1.5.2", ], - python_requires=">=3.7.0", + python_requires=">=3.9", extras_require={ - "dev": ["MyPy~=0.812", "pytest", "pytest-cov", "pytest-mock", "requests-mock", "pytest-httpserver"], + "dev": [ + "MyPy~=0.812", + "pytest", + "pytest-cov", + "pytest-mock", + "requests-mock", + "pytest-httpserver", + ], "sphinx-docs": [ "Sphinx~=4.2", "sphinx-rtd-theme~=1.0", ], }, - entry_points={ - "console_scripts": ["base-python=base_python.entrypoint:main"], - }, ) diff --git a/airbyte-cdk/python/type_check_and_test.sh b/airbyte-cdk/python/type_check_and_test.sh index 643786b9f5298..93fccd7e1e197 100755 --- a/airbyte-cdk/python/type_check_and_test.sh +++ b/airbyte-cdk/python/type_check_and_test.sh @@ -5,8 +5,7 @@ # Static Type Checking echo "Running MyPy to static check and test files." -# Exclude Singer for the time being. -mypy --exclude '/*singer*/' airbyte_cdk/ unit_tests/ +mypy airbyte_cdk/ unit_tests/ printf "\n" diff --git a/airbyte-cdk/python/unit_tests/destinations/test_destination.py b/airbyte-cdk/python/unit_tests/destinations/test_destination.py index 18b2cc3037c16..66043b0365bb3 100644 --- a/airbyte-cdk/python/unit_tests/destinations/test_destination.py +++ b/airbyte-cdk/python/unit_tests/destinations/test_destination.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# import argparse @@ -11,6 +11,7 @@ import pytest from airbyte_cdk.destinations import Destination +from airbyte_cdk.destinations import destination as destination_module from airbyte_cdk.models import ( AirbyteCatalog, AirbyteConnectionStatus, @@ -136,6 +137,13 @@ def __eq__(self, other): class TestRun: + def test_run_initializes_exception_handler(self, mocker, destination: Destination): + mocker.patch.object(destination_module, "init_uncaught_exception_handler") + mocker.patch.object(destination, "parse_args") + mocker.patch.object(destination, "run_cmd") + destination.run(["dummy"]) + destination_module.init_uncaught_exception_handler.assert_called_once_with(destination_module.logger) + def test_run_spec(self, mocker, destination: Destination): args = {"command": "spec"} parsed_args = argparse.Namespace(**args) diff --git a/airbyte-cdk/python/unit_tests/singer/test_singer_helpers.py b/airbyte-cdk/python/unit_tests/singer/test_singer_helpers.py index a732515217c4c..54f783f0b530d 100644 --- a/airbyte-cdk/python/unit_tests/singer/test_singer_helpers.py +++ b/airbyte-cdk/python/unit_tests/singer/test_singer_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/unit_tests/singer/test_singer_source.py b/airbyte-cdk/python/unit_tests/singer/test_singer_source.py index 319745c3e22b6..58e53500595fe 100644 --- a/airbyte-cdk/python/unit_tests/singer/test_singer_source.py +++ b/airbyte-cdk/python/unit_tests/singer/test_singer_source.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py new file mode 100644 index 0000000000000..12cb353de5c04 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_oauth.py @@ -0,0 +1,95 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import logging + +import pendulum +import requests +from airbyte_cdk.sources.declarative.auth import DeclarativeOauth2Authenticator +from requests import Response + +LOGGER = logging.getLogger(__name__) + +resp = Response() + +config = { + "refresh_endpoint": "refresh_end", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "token_expiry_date": pendulum.now().subtract(days=2).to_rfc3339_string(), + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", +} +options = {"refresh_token": "some_refresh_token"} + + +class TestOauth2Authenticator: + """ + Test class for OAuth2Authenticator. 
+ """ + + def test_refresh_request_body(self): + """ + Request body should match given configuration. + """ + scopes = ["scope1", "scope2"] + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ options['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + options=options, + ) + body = oauth.build_refresh_request_body() + expected = { + "grant_type": "refresh_token", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": "some_refresh_token", + "scopes": scopes, + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", + } + assert body == expected + + def test_refresh_access_token(self, mocker): + oauth = DeclarativeOauth2Authenticator( + token_refresh_endpoint="{{ config['refresh_endpoint'] }}", + client_id="{{ config['client_id'] }}", + client_secret="{{ config['client_secret'] }}", + refresh_token="{{ config['refresh_token'] }}", + config=config, + scopes=["scope1", "scope2"], + token_expiry_date="{{ config['token_expiry_date'] }}", + refresh_request_body={ + "custom_field": "{{ config['custom_field'] }}", + "another_field": "{{ config['another_field'] }}", + "scopes": ["no_override"], + }, + options={}, + ) + + resp.status_code = 200 + mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": 1000}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) + token = oauth.refresh_access_token() + + schem = DeclarativeOauth2Authenticator.json_schema() + print(schem) + + assert ("access_token", 1000) == token + + +def mock_request(method, url, data): + if url == "refresh_end": + return resp + raise Exception(f"Error while refreshing access token with request: {method}, {url}, {data}") diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py new file mode 100644 index 0000000000000..29613a73fbe5f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/auth/test_token_auth.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import logging + +import pytest +import requests +from airbyte_cdk.sources.declarative.auth.token import ApiKeyAuthenticator, BasicHttpAuthenticator, BearerAuthenticator +from requests import Response + +LOGGER = logging.getLogger(__name__) + +resp = Response() +config = {"username": "user", "password": "password", "header": "header"} +options = {"username": "user", "password": "password", "header": "header"} + + +@pytest.mark.parametrize( + "test_name, token, expected_header_value", + [ + ("test_static_token", "test-token", "Bearer test-token"), + ("test_token_from_config", "{{ config.username }}", "Bearer user"), + ("test_token_from_options", "{{ options.username }}", "Bearer user"), + ], +) +def test_bearer_token_authenticator(test_name, token, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. 
+ """ + token_auth = BearerAuthenticator(token, config, options=options) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": expected_header_value} == prepared_request.headers + assert {"Authorization": expected_header_value} == header1 + assert {"Authorization": expected_header_value} == header2 + + +@pytest.mark.parametrize( + "test_name, username, password, expected_header_value", + [ + ("test_static_creds", "user", "password", "Basic dXNlcjpwYXNzd29yZA=="), + ("test_creds_from_config", "{{ config.username }}", "{{ config.password }}", "Basic dXNlcjpwYXNzd29yZA=="), + ("test_creds_from_options", "{{ options.username }}", "{{ options.password }}", "Basic dXNlcjpwYXNzd29yZA=="), + ], +) +def test_basic_authenticator(test_name, username, password, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_auth = BasicHttpAuthenticator(username=username, password=password, config=config, options=options) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": expected_header_value} == prepared_request.headers + assert {"Authorization": expected_header_value} == header1 + assert {"Authorization": expected_header_value} == header2 + + +@pytest.mark.parametrize( + "test_name, header, token, expected_header, expected_header_value", + [ + ("test_static_token", "Authorization", "test-token", "Authorization", "test-token"), + ("test_token_from_config", "{{ config.header }}", "{{ config.username }}", "header", "user"), + ("test_token_from_options", "{{ options.header }}", "{{ options.username }}", "header", "user"), + ], +) +def test_api_key_authenticator(test_name, header, token, expected_header, expected_header_value): + """ + Should match passed in token, no matter how many times token is retrieved. + """ + token_auth = ApiKeyAuthenticator(header=header, api_token=token, config=config, options=options) + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {expected_header: expected_header_value} == prepared_request.headers + assert {expected_header: expected_header_value} == header1 + assert {expected_header: expected_header_value} == header2 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/checks/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py b/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py new file mode 100644 index 0000000000000..827b99ab64842 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/checks/test_check_stream.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.checks.check_stream import CheckStream + +logger = None +config = dict() + +stream_names = ["s1"] +record = MagicMock() + + +@pytest.mark.parametrize( + "test_name, record, streams_to_check, expectation", + [ + ("test success check", record, stream_names, (True, None)), + ("test fail check", None, stream_names, (True, None)), + ("test try to check invalid stream", record, ["invalid_stream_name"], None), + ], +) +def test_check_stream(test_name, record, streams_to_check, expectation): + stream = MagicMock() + stream.name = "s1" + stream.read_records.return_value = iter([record]) + + source = MagicMock() + source.streams.return_value = [stream] + + check_stream = CheckStream(streams_to_check, options={}) + + if expectation: + actual = check_stream.check_connection(source, logger, config) + assert actual == expectation + else: + with pytest.raises(ValueError): + check_stream.check_connection(source, logger, config) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py new file mode 100644 index 0000000000000..f67032c02d580 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/datetime/test_min_max_datetime.py @@ -0,0 +1,96 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import datetime + +import pytest +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime + +date_format = "%Y-%m-%dT%H:%M:%S.%f%z" + +old_date = "2021-01-01T20:12:19.597854Z" +middle_date = "2022-01-01T20:12:19.597854Z" +new_date = "2022-06-24T20:12:19.597854Z" + + +@pytest.mark.parametrize( + "test_name, date, min_date, max_date, expected_date", + [ + ("test_time_is_greater_than_min", "{{ config['older'] }}", "{{ stream_state['newer'] }}", "", new_date), + ("test_time_is_less_than_min", "{{ stream_state['newer'] }}", "{{ config['older'] }}", "", new_date), + ("test_time_is_equal_to_min", "{{ config['older'] }}", "{{ config['older'] }}", "", old_date), + ("test_time_is_greater_than_max", "{{ stream_state['newer'] }}", "", "{{ config['older'] }}", old_date), + ("test_time_is_less_than_max", "{{ config['older'] }}", "", "{{ stream_state['newer'] }}", old_date), + ("test_time_is_equal_to_min", "{{ stream_state['newer'] }}", "{{ stream_state['newer'] }}", "", new_date), + ( + "test_time_is_between_min_and_max", + "{{ config['middle'] }}", + "{{ config['older'] }}", + "{{ stream_state['newer'] }}", + middle_date, + ), + ("test_min_newer_time_from_options", "{{ config['older'] }}", "{{ options['newer'] }}", "", new_date), + ("test_max_newer_time_from_options", "{{ stream_state['newer'] }}", "", "{{ options['older'] }}", old_date), + ], +) +def test_min_max_datetime(test_name, date, min_date, max_date, expected_date): + print(MinMaxDatetime.json_schema()) + config = {"older": old_date, "middle": middle_date} + stream_state = {"newer": new_date} + options = {"newer": new_date, "older": old_date} + + min_max_date = MinMaxDatetime(datetime=date, min_datetime=min_date, max_datetime=max_date, options=options) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime(expected_date, date_format) + + +def test_custom_datetime_format(): + config = {"older": "2021-01-01T20:12:19", "middle": "2022-01-01T20:12:19"} + stream_state = {"newer": "2022-06-24T20:12:19"} + + 
min_max_date = MinMaxDatetime( + datetime="{{ config['middle'] }}", + datetime_format="%Y-%m-%dT%H:%M:%S", + min_datetime="{{ config['older'] }}", + max_datetime="{{ stream_state['newer'] }}", + options={}, + ) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime("2022-01-01T20:12:19", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=datetime.timezone.utc) + + +def test_format_is_a_number(): + config = {"older": "20210101", "middle": "20220101"} + stream_state = {"newer": "20220624"} + + min_max_date = MinMaxDatetime( + datetime="{{ config['middle'] }}", + datetime_format="%Y%m%d", + min_datetime="{{ config['older'] }}", + max_datetime="{{ stream_state['newer'] }}", + options={}, + ) + actual_date = min_max_date.get_datetime(config, **{"stream_state": stream_state}) + + assert actual_date == datetime.datetime.strptime("20220101", "%Y%m%d").replace(tzinfo=datetime.timezone.utc) + + +def test_set_datetime_format(): + min_max_date = MinMaxDatetime(datetime="{{ config['middle'] }}", min_datetime="{{ config['older'] }}", options={}) + + # Retrieve datetime using the default datetime formatting + default_fmt_config = {"older": "2021-01-01T20:12:19.597854Z", "middle": "2022-01-01T20:12:19.597854Z"} + actual_date = min_max_date.get_datetime(default_fmt_config) + + assert actual_date == datetime.datetime.strptime("2022-01-01T20:12:19.597854Z", "%Y-%m-%dT%H:%M:%S.%f%z") + + # Set a different datetime format and attempt to retrieve datetime using an updated format + min_max_date.datetime_format = "%Y-%m-%dT%H:%M:%S" + + custom_fmt_config = {"older": "2021-01-01T20:12:19", "middle": "2022-01-01T20:12:19"} + actual_date = min_max_date.get_datetime(custom_fmt_config) + + assert actual_date == datetime.datetime.strptime("2022-01-01T20:12:19", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=datetime.timezone.utc) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_jello.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_jello.py new file mode 100644 index 0000000000000..b9a1ec25322d8 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_jello.py @@ -0,0 +1,54 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
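+#
+# JelloExtractor (exercised below) evaluates a Jello/Python expression against the
+# decoded JSON response, with `_` bound to the parsed body; the expression itself may
+# be interpolated from config or options. For example, as in the tests,
+#   JelloExtractor(transform="_.data", config=config, decoder=decoder, options=options)
+# returns the list under the top-level "data" key.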
+# + +import json + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor + +config = {"field": "record_array"} +options = {"options_field": "record_array"} + +decoder = JsonDecoder(options={}) + + +@pytest.mark.parametrize( + "test_name, transform, body, expected_records", + [ + ("test_extract_from_array", "_.data", {"data": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + ("test_field_in_config", "_.{{ config['field'] }}", {"record_array": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + ("test_field_in_options", "_.{{ options['options_field'] }}", {"record_array": [{"id": 1}, {"id": 2}]}, [{"id": 1}, {"id": 2}]), + ("test_default", "_{{kwargs['field']}}", [{"id": 1}, {"id": 2}], [{"id": 1}, {"id": 2}]), + ( + "test_remove_fields_from_records", + "[{k:v for k,v in d.items() if k != 'value_to_remove'} for d in _.data]", + {"data": [{"id": 1, "value": "HELLO", "value_to_remove": "fail"}, {"id": 2, "value": "WORLD", "value_to_remove": "fail"}]}, + [{"id": 1, "value": "HELLO"}, {"id": 2, "value": "WORLD"}], + ), + ( + "test_add_fields_from_records", + "[{**{k:v for k,v in d.items()}, **{'project_id': d['project']['id']}} for d in _.data]", + {"data": [{"id": 1, "value": "HELLO", "project": {"id": 8}}, {"id": 2, "value": "WORLD", "project": {"id": 9}}]}, + [ + {"id": 1, "value": "HELLO", "project_id": 8, "project": {"id": 8}}, + {"id": 2, "value": "WORLD", "project_id": 9, "project": {"id": 9}}, + ], + ), + ], +) +def test(test_name, transform, body, expected_records): + extractor = JelloExtractor(transform=transform, config=config, decoder=decoder, options=options) + + response = create_response(body) + actual_records = extractor.extract_records(response) + + assert actual_records == expected_records + + +def create_response(body): + response = requests.Response() + response._content = json.dumps(body).encode("utf-8") + return response diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py new file mode 100644 index 0000000000000..e58db11ada566 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_filter.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
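+#
+# RecordFilter (exercised below) keeps only the records for which an interpolated
+# boolean condition evaluates truthy; the condition can reference the record itself
+# as well as stream_state, stream_slice, next_page_token and options, e.g.
+#   "{{ record['created_at'] > stream_state['created_at'] }}"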
+# + +import pytest +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter + + +@pytest.mark.parametrize( + "test_name, filter_template, records, expected_records", + [ + ( + "test_using_state_filter", + "{{ record['created_at'] > stream_state['created_at'] }}", + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + [{"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + ), + ( + "test_with_slice_filter", + "{{ record['last_seen'] >= stream_slice['last_seen'] }}", + [{"id": 1, "last_seen": "06-06-21"}, {"id": 2, "last_seen": "06-07-21"}, {"id": 3, "last_seen": "06-10-21"}], + [{"id": 3, "last_seen": "06-10-21"}], + ), + ( + "test_with_next_page_token_filter", + "{{ record['id'] >= next_page_token['last_seen_id'] }}", + [{"id": 11}, {"id": 12}, {"id": 13}, {"id": 14}, {"id": 15}], + [{"id": 14}, {"id": 15}], + ), + ( + "test_missing_filter_fields_return_no_results", + "{{ record['id'] >= next_page_token['path_to_nowhere'] }}", + [{"id": 11}, {"id": 12}, {"id": 13}, {"id": 14}, {"id": 15}], + [], + ), + ( + "test_using_options_filter", + "{{ record['created_at'] > options['created_at'] }}", + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + [{"id": 3, "created_at": "06-08-21"}], + ), + ], +) +def test_record_filter(test_name, filter_template, records, expected_records): + config = {"response_override": "stop_if_you_see_me"} + options = {"created_at": "06-07-21"} + stream_state = {"created_at": "06-06-21"} + stream_slice = {"last_seen": "06-10-21"} + next_page_token = {"last_seen_id": 14} + record_filter = RecordFilter(config=config, condition=filter_template, options=options) + + actual_records = record_filter.filter_records( + records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + assert actual_records == expected_records diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py new file mode 100644 index 0000000000000..fa2bbfdcd7ce5 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/extractors/test_record_selector.py @@ -0,0 +1,73 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
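+#
+# RecordSelector (exercised below) composes a JelloExtractor with an optional
+# RecordFilter: the extractor pulls records out of the response, the filter (when
+# present) drops non-matching ones, and a single extracted object is returned as a
+# one-element list.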
+# + +import json + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector + + +@pytest.mark.parametrize( + "test_name, transform_template, filter_template, body, expected_records", + [ + ( + "test_with_extractor_and_filter", + "_.data", + "{{ record['created_at'] > stream_state['created_at'] }}", + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}]}, + [{"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}], + ), + ( + "test_no_record_filter_returns_all_records", + "_.data", + None, + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}]}, + [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}], + ), + ( + "test_with_extractor_and_filter_with_options", + "_.{{ options['options_field'] }}", + "{{ record['created_at'] > options['created_at'] }}", + {"data": [{"id": 1, "created_at": "06-06-21"}, {"id": 2, "created_at": "06-07-21"}, {"id": 3, "created_at": "06-08-21"}]}, + [{"id": 3, "created_at": "06-08-21"}], + ), + ( + "test_read_single_record", + "_.data", + None, + {"data": {"id": 1, "created_at": "06-06-21"}}, + [{"id": 1, "created_at": "06-06-21"}], + ), + ], +) +def test_record_filter(test_name, transform_template, filter_template, body, expected_records): + config = {"response_override": "stop_if_you_see_me"} + options = {"options_field": "data", "created_at": "06-07-21"} + stream_state = {"created_at": "06-06-21"} + stream_slice = {"last_seen": "06-10-21"} + next_page_token = {"last_seen_id": 14} + + response = create_response(body) + decoder = JsonDecoder(options={}) + extractor = JelloExtractor(transform=transform_template, decoder=decoder, config=config, options=options) + if filter_template is None: + record_filter = None + else: + record_filter = RecordFilter(config=config, condition=filter_template, options=options) + record_selector = RecordSelector(extractor=extractor, record_filter=record_filter, options=options) + + actual_records = record_selector.select_records( + response=response, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token + ) + assert actual_records == expected_records + + +def create_response(body): + response = requests.Response() + response._content = json.dumps(body).encode("utf-8") + return response diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py new file mode 100644 index 0000000000000..244d041846cbd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py @@ -0,0 +1,41 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
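+#
+# InterpolatedBoolean (exercised below) evaluates a Jinja condition against the
+# config and options; values follow Python truthiness, so zero, empty containers,
+# missing keys and the string "false" all evaluate to False.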
+# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean + +config = { + "parent": {"key_with_true": True}, + "string_key": "compare_me", + "zero_value": 0, + "empty_array": [], + "non_empty_array": [1], + "empty_dict": {}, + "empty_tuple": (), +} + + +@pytest.mark.parametrize( + "test_name, template, expected_result", + [ + ("test_interpolated_true_value", "{{ config['parent']['key_with_true'] }}", True), + ("test_interpolated_true_comparison", "{{ config['string_key'] == \"compare_me\" }}", True), + ("test_interpolated_false_condition", "{{ config['string_key'] == \"witness_me\" }}", False), + ("test_path_has_value_returns_true", "{{ config['string_key'] }}", True), + ("test_missing_key_defaults_to_false", "{{ path_to_nowhere }}", False), + ("test_zero_is_false", "{{ config['zero_value'] }}", False), + ("test_empty_array_is_false", "{{ config['empty_array'] }}", False), + ("test_empty_dict_is_false", "{{ config['empty_dict'] }}", False), + ("test_empty_tuple_is_false", "{{ config['empty_tuple'] }}", False), + ("test_lowercase_false", '{{ "false" }}', False), + ("test_False", "{{ False }}", False), + ("test_True", "{{ True }}", True), + ("test_value_in_array", "{{ 1 in config['non_empty_array'] }}", True), + ("test_value_not_in_array", "{{ 2 in config['non_empty_array'] }}", False), + ("test_interpolation_using_options", "{{ options['from_options'] == \"come_find_me\" }}", True), + ], +) +def test_interpolated_boolean(test_name, template, expected_result): + interpolated_bool = InterpolatedBoolean(condition=template, options={"from_options": "come_find_me"}) + assert interpolated_bool.eval(config) == expected_result diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py new file mode 100644 index 0000000000000..8491cc6b9086b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
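+#
+# InterpolatedMapping (exercised below) evaluates both the keys and the values of a
+# mapping, with config, options and any extra keyword arguments (here "kwargs")
+# available to the templates.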
+# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping + + +@pytest.mark.parametrize( + "test_name, key, expected_value", + [ + ("test_field_value", "field", "value"), + ("test_number", "number", 100), + ("test_field_to_interpolate_from_config", "field_to_interpolate_from_config", "VALUE_FROM_CONFIG"), + ("test_field_to_interpolate_from_kwargs", "field_to_interpolate_from_kwargs", "VALUE_FROM_KWARGS"), + ("test_field_to_interpolate_from_options", "field_to_interpolate_from_options", "VALUE_FROM_OPTIONS"), + ("test_key_is_interpolated", "key", "VALUE"), + ], +) +def test(test_name, key, expected_value): + d = { + "field": "value", + "number": 100, + "field_to_interpolate_from_config": "{{ config['c'] }}", + "field_to_interpolate_from_kwargs": "{{ kwargs['a'] }}", + "field_to_interpolate_from_options": "{{ options['b'] }}", + "{{ options.k }}": "VALUE", + } + config = {"c": "VALUE_FROM_CONFIG"} + kwargs = {"a": "VALUE_FROM_KWARGS"} + mapping = InterpolatedMapping(mapping=d, options={"b": "VALUE_FROM_OPTIONS", "k": "key"}) + + interpolated = mapping.eval(config, **{"kwargs": kwargs}) + + assert interpolated[key] == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py new file mode 100644 index 0000000000000..089174c82f52c --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_interpolated_string.py @@ -0,0 +1,25 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + +config = {"field": "value"} +options = {"hello": "world"} +kwargs = {"c": "airbyte"} + + +@pytest.mark.parametrize( + "test_name, input_string, expected_value", + [ + ("test_static_value", "HELLO WORLD", "HELLO WORLD"), + ("test_eval_from_options", "{{ options['hello'] }}", "world"), + ("test_eval_from_config", "{{ config['field'] }}", "value"), + ("test_eval_from_kwargs", "{{ kwargs['c'] }}", "airbyte"), + ("test_eval_from_kwargs", "{{ kwargs['c'] }}", "airbyte"), + ], +) +def test_interpolated_string(test_name, input_string, expected_value): + s = InterpolatedString.create(input_string, options=options) + assert s.eval(config, **{"kwargs": kwargs}) == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py new file mode 100644 index 0000000000000..62fc8267fad26 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_jinja.py @@ -0,0 +1,80 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
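+#
+# JinjaInterpolation (exercised below) renders a template against the config plus
+# arbitrary keyword contexts such as stream_slice or records, and returns literals
+# (numbers, lists, dicts) as Python values rather than strings, e.g.
+#   interpolation.eval("{{ stream_slice['date'] }}", config, stream_slice=stream_slice)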
+# + +import datetime + +import pytest +from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation + +interpolation = JinjaInterpolation() + + +def test_get_value_from_config(): + s = "{{ config['date'] }}" + config = {"date": "2022-01-01"} + val = interpolation.eval(s, config) + assert val == "2022-01-01" + + +def test_get_value_from_stream_slice(): + s = "{{ stream_slice['date'] }}" + config = {"date": "2022-01-01"} + stream_slice = {"date": "2020-09-09"} + val = interpolation.eval(s, config, **{"stream_slice": stream_slice}) + assert val == "2020-09-09" + + +def test_get_value_from_a_list_of_mappings(): + s = "{{ records[0]['date'] }}" + config = {"date": "2022-01-01"} + records = [{"date": "2020-09-09"}] + val = interpolation.eval(s, config, **{"records": records}) + assert val == "2020-09-09" + + +@pytest.mark.parametrize( + "test_name, s, value", + [ + ("test_number", "{{1}}", 1), + ("test_list", "{{[1,2]}}", [1, 2]), + ("test_dict", "{{ {1:2} }}", {1: 2}), + ("test_addition", "{{ 1+2 }}", 3), + ], +) +def test_literals(test_name, s, value): + val = interpolation.eval(s, None) + assert val == value + + +def test_positive_day_delta(): + delta_template = "{{ day_delta(25) }}" + interpolation = JinjaInterpolation() + val = interpolation.eval(delta_template, {}) + + # We need to assert against an earlier delta since the interpolation function runs datetime.now() a few milliseconds earlier + assert val > (datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(days=24, hours=23)).strftime("%Y-%m-%dT%H:%M:%S.%f%z") + + +def test_negative_day_delta(): + delta_template = "{{ day_delta(-25) }}" + interpolation = JinjaInterpolation() + val = interpolation.eval(delta_template, {}) + + assert val <= (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=25)).strftime("%Y-%m-%dT%H:%M:%S.%f%z") + + +@pytest.mark.parametrize( + "test_name, s, expected_value", + [ + ("test_timestamp_from_timestamp", "{{ timestamp(1621439283) }}", 1621439283), + ("test_timestamp_from_string", "{{ timestamp('2021-05-19') }}", 1621382400), + ("test_timestamp_from_rfc3339", "{{ timestamp('2017-01-01T00:00:00.0Z') }}", 1483228800), + ("test_max", "{{ max(1,2) }}", 2), + ], +) +def test_macros(test_name, s, expected_value): + interpolation = JinjaInterpolation() + config = {} + val = interpolation.eval(s, config) + assert val == expected_value diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py new file mode 100644 index 0000000000000..14266f44ee68a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/interpolation/test_macros.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
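+#
+# The `macros` mapping (exercised below) lists the helper functions exposed to Jinja
+# expressions, including now_local, now_utc, today_utc, max and day_delta; test_jinja.py
+# shows them in use, e.g. "{{ day_delta(25) }}" and "{{ timestamp('2021-05-19') }}".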
+# + +import pytest +from airbyte_cdk.sources.declarative.interpolation.macros import macros + + +@pytest.mark.parametrize( + "test_name, fn_name, found_in_macros", + [ + ("test_now_local", "now_local", True), + ("test_now_utc", "now_utc", True), + ("test_today_utc", "today_utc", True), + ("test_max", "max", True), + ("test_day_delta", "day_delta", True), + ("test_not_a_macro", "thisisnotavalidmacro", False), + ], +) +def test_macros_export(test_name, fn_name, found_in_macros): + if found_in_macros: + assert fn_name in macros + else: + assert fn_name not in macros diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/iterators/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/iterators/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/iterators/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/iterators/test_only_once.py b/airbyte-cdk/python/unit_tests/sources/declarative/iterators/test_only_once.py new file mode 100644 index 0000000000000..d51ca23b04e37 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/iterators/test_only_once.py @@ -0,0 +1,13 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.stream_slicers.single_slice import SingleSlice + + +def test(): + iterator = SingleSlice(options={}) + + stream_slices = iterator.stream_slices(SyncMode.incremental, None) + assert stream_slices == [dict()] diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_yaml_parser.py b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_yaml_parser.py new file mode 100644 index 0000000000000..bbc9104ab1b7f --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/parsers/test_yaml_parser.py @@ -0,0 +1,144 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
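+#
+# YamlParser (exercised below) parses a declarative YAML definition and resolves
+# "*ref(path.to.key)" pointers, as well as "$ref" entries whose referenced mapping is
+# merged in and can then be selectively overridden.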
+# + +import pytest +from airbyte_cdk.sources.declarative.parsers.undefined_reference_exception import UndefinedReferenceException +from airbyte_cdk.sources.declarative.parsers.yaml_parser import YamlParser + +parser = YamlParser() + + +def test(): + content = """ + limit: 50 + """ + config = parser.parse(content) + assert config["limit"] == 50 + + +def test_get_ref(): + s = """ + limit_ref: "*ref(limit)" + """ + ref_key = parser._get_ref_key(s) + assert ref_key == "limit" + + +def test_get_ref_no_ref(): + s = """ + limit: 50 + """ + ref_key = parser._get_ref_key(s) + assert ref_key is None + + +def test_refer(): + content = """ + limit: 50 + limit_ref: "*ref(limit)" + """ + config = parser.parse(content) + assert config["limit_ref"] == 50 + + +def test_refer_to_inner(): + content = """ + dict: + limit: 50 + limit_ref: "*ref(dict.limit)" + """ + config = parser.parse(content) + assert config["limit_ref"] == 50 + + +def test_refer_to_non_existant_struct(): + content = """ + dict: + limit: 50 + limit_ref: "*ref(not_dict)" + """ + with pytest.raises(UndefinedReferenceException): + parser.parse(content) + + +def test_refer_in_dict(): + content = """ + limit: 50 + offset_request_parameters: + offset: "{{ next_page_token['offset'] }}" + limit: "*ref(limit)" + """ + config = parser.parse(content) + assert config["offset_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + assert config["offset_request_parameters"]["limit"] == 50 + + +def test_refer_to_dict(): + content = """ + limit: 50 + offset_request_parameters: + offset: "{{ next_page_token['offset'] }}" + limit: "*ref(limit)" + offset_pagination_request_parameters: + class: InterpolatedRequestParameterProvider + request_parameters: "*ref(offset_request_parameters)" + """ + config = parser.parse(content) + assert config["limit"] == 50 + assert config["offset_request_parameters"]["limit"] == 50 + assert len(config["offset_pagination_request_parameters"]) == 2 + assert config["offset_pagination_request_parameters"]["request_parameters"]["limit"] == 50 + assert config["offset_pagination_request_parameters"]["request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + + +def test_refer_and_overwrite(): + content = """ + limit: 50 + custom_limit: 25 + offset_request_parameters: + offset: "{{ next_page_token['offset'] }}" + limit: "*ref(limit)" + custom_request_parameters: + $ref: "*ref(offset_request_parameters)" + limit: "*ref(custom_limit)" + """ + config = parser.parse(content) + assert config["offset_request_parameters"]["limit"] == 50 + assert config["custom_request_parameters"]["limit"] == 25 + + assert config["offset_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + assert config["custom_request_parameters"]["offset"] == "{{ next_page_token['offset'] }}" + + +def test_collision(): + content = """ +example: + nested: + path: "first one" + more_nested: + value: "found it!" + nested.path: "uh oh" +reference_to_nested_path: + $ref: "*ref(example.nested.path)" +reference_to_nested_nested_value: + $ref: "*ref(example.nested.more_nested.value)" + """ + config = parser.parse(content) + assert config["example"]["nested"]["path"] == "first one" + assert config["example"]["nested.path"] == "uh oh" + assert config["reference_to_nested_path"] == "uh oh" + assert config["example"]["nested"]["more_nested"]["value"] == "found it!" + assert config["reference_to_nested_nested_value"] == "found it!" 
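+
+
+# A minimal illustrative sketch of the same referencing mechanism, assuming only the
+# "*ref(...)" syntax exercised above; the URL value is hypothetical.
+def test_ref_to_shared_string():
+    content = """
+    base_url: "https://api.example.com/v1"
+    requester:
+      url_base: "*ref(base_url)"
+    """
+    config = parser.parse(content)
+    assert config["requester"]["url_base"] == "https://api.example.com/v1"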
+ + +def test_list(): + content = """ + list: + - "A" + - "B" + elem_ref: "*ref(list[0])" + """ + config = parser.parse(content) + elem_ref = config["elem_ref"] + assert elem_ref == "A" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py new file mode 100644 index 0000000000000..8ee5af1e57ff6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py @@ -0,0 +1,24 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy + +BACKOFF_TIME = 10 + + +@pytest.mark.parametrize( + "test_name, attempt_count, expected_backoff_time", + [ + ("test_exponential_backoff", 1, BACKOFF_TIME), + ("test_exponential_backoff", 2, BACKOFF_TIME), + ], +) +def test_exponential_backoff(test_name, attempt_count, expected_backoff_time): + response_mock = MagicMock() + backoff_strategy = ConstantBackoffStrategy(backoff_time_in_seconds=BACKOFF_TIME) + backoff = backoff_strategy.backoff(response_mock, attempt_count) + assert backoff == expected_backoff_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py new file mode 100644 index 0000000000000..d60a862770afb --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
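+#
+# ExponentialBackoffStrategy (exercised below) waits factor * 2**attempt_count seconds
+# between retries; the tests imply a default factor of 5, e.g.
+#   ExponentialBackoffStrategy(factor=5).backoff(response, 2) == 20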
+# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.exponential_backoff_strategy import ( + ExponentialBackoffStrategy, +) + + +@pytest.mark.parametrize( + "test_name, attempt_count, expected_backoff_time", + [ + ("test_exponential_backoff", 1, 10), + ("test_exponential_backoff", 2, 20), + ], +) +def test_exponential_backoff(test_name, attempt_count, expected_backoff_time): + response_mock = MagicMock() + backoff_strategy = ExponentialBackoffStrategy(factor=5) + backoff = backoff_strategy.backoff(response_mock, attempt_count) + assert backoff == expected_backoff_time + + +def test_exponential_backoff_default(): + response_mock = MagicMock() + backoff_strategy = ExponentialBackoffStrategy() + backoff = backoff_strategy.backoff(response_mock, 3) + assert backoff == 40 diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py new file mode 100644 index 0000000000000..8142322118916 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py @@ -0,0 +1,38 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import re +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.header_helper import get_numeric_value_from_header + + +@pytest.mark.parametrize( + "test_name, headers, requested_header, regex, expected_value", + [ + ("test_get_numeric_value_from_header", {"header": 1}, "header", None, 1), + ("test_get_numeric_value_float_from_header", {"header": 1.2}, "header", None, 1.2), + ("test_get_numeric_value_from_string_value", {"header": "10.9"}, "header", None, 10.9), + ("test_get_numeric_value_from_non_numeric", {"header": "60,120"}, "header", None, None), + ("test_get_numeric_value_from_missing_header", {"header": 1}, "notheader", None, None), + ("test_get_numeric_value_with_regex", {"header": "61,60"}, "header", re.compile("([-+]?\d+)"), 61), # noqa + ("test_get_numeric_value_with_regex_no_header", {"header": "61,60"}, "notheader", re.compile("([-+]?\d+)"), None), # noqa + ("test_get_numeric_value_with_regex_not_matching", {"header": "abc61,60"}, "header", re.compile("([-+]?\d+)"), None), # noqa + ], +) +def test_get_numeric_value_from_header(test_name, headers, requested_header, regex, expected_value): + response_mock = create_response(headers=headers) + numeric_value = get_numeric_value_from_header(response_mock, requested_header, regex) + assert numeric_value == expected_value + + +def create_response(headers=None, json_body=None): + url = "https://airbyte.io" + + response_mock = MagicMock() + response_mock.url = url + response_mock.headers = headers or {} + response_mock.json.return_value = json_body or {} + return response_mock diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py new file mode 100644 index 0000000000000..915a13762d282 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py @@ -0,0 +1,31 @@ +# +# Copyright (c) 2022 Airbyte, Inc., 
all rights reserved. +# + +from unittest.mock import MagicMock + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_time_from_header_backoff_strategy import ( + WaitTimeFromHeaderBackoffStrategy, +) + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, header, header_value, regex, expected_backoff_time", + [ + ("test_wait_time_from_header", "wait_time", SOME_BACKOFF_TIME, None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_string", "wait_time", "60", None, SOME_BACKOFF_TIME), + ("test_wait_time_from_header_not_a_number", "wait_time", "61,60", None, None), + ("test_wait_time_from_header_with_regex", "wait_time", "61,60", "([-+]?\d+)", 61), # noqa + ("test_wait_time_from_header_with_regex_no_match", "wait_time", "...", "[-+]?\d+", None), # noqa + ("test_wait_time_from_header", "absent_header", None, None, None), + ], +) +def test_wait_time_from_header(test_name, header, header_value, regex, expected_backoff_time): + response_mock = MagicMock() + response_mock.headers = {"wait_time": header_value} + backoff_strategy = WaitTimeFromHeaderBackoffStrategy(header, regex) + backoff = backoff_strategy.backoff(response_mock, 1) + assert backoff == expected_backoff_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py new file mode 100644 index 0000000000000..713ee4c4896fe --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py @@ -0,0 +1,36 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
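Both header-based strategies rely on the parsing contract pinned down in test_header_helper.py above: read a header, optionally narrow it with a regex matched at the start of the value, and return a number, or None when the header is missing, non-numeric, or unmatched. A hedged stand-in (not the CDK's get_numeric_value_from_header; Retry-After is just an example header):

```python
import re
from typing import Optional

import requests


def numeric_header_value(response: requests.Response, header: str, regex: Optional[re.Pattern] = None) -> Optional[float]:
    value = response.headers.get(header)
    if value is None:
        return None
    if regex is not None:
        match = regex.match(str(value))
        if not match:
            return None  # header present but the regex did not match at the start
        value = match.group()
    try:
        return float(value)
    except ValueError:
        return None


response = requests.Response()
response.headers["Retry-After"] = "61,60"
assert numeric_header_value(response, "Retry-After", re.compile(r"[-+]?\d+")) == 61
assert numeric_header_value(response, "Retry-After") is None  # "61,60" is not a number
```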
+# + + +from unittest.mock import MagicMock, patch + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.wait_until_time_from_header_backoff_strategy import ( + WaitUntilTimeFromHeaderBackoffStrategy, +) + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, header, wait_until, min_wait, regex, expected_backoff_time", + [ + ("test_wait_until_time_from_header", "wait_until", 1600000060.0, None, None, 60), + ("test_wait_until_negative_time", "wait_until", 1500000000.0, None, None, None), + ("test_wait_until_time_less_than_min", "wait_until", 1600000060.0, 120, None, 120), + ("test_wait_until_no_header", "absent_header", 1600000000.0, None, None, None), + ("test_wait_until_time_from_header_not_numeric", "wait_until", "1600000000,1600000000", None, None, None), + ("test_wait_until_time_from_header_is_numeric", "wait_until", "1600000060", None, None, 60), + ("test_wait_until_time_from_header_with_regex", "wait_until", "1600000060,60", None, "[-+]?\d+", 60), # noqa + ("test_wait_until_time_from_header_with_regex_no_match", "wait_time", "...", None, "[-+]?\d+", None), # noqa + ("test_wait_until_no_header_with_min", "absent_header", "1600000000.0", SOME_BACKOFF_TIME, None, SOME_BACKOFF_TIME), + ], +) +@patch("time.time", return_value=1600000000.0) +def test_wait_until_time_from_header(time_mock, test_name, header, wait_until, min_wait, regex, expected_backoff_time): + response_mock = MagicMock() + response_mock.headers = {"wait_until": wait_until} + backoff_strategy = WaitUntilTimeFromHeaderBackoffStrategy(header, min_wait, regex) + backoff = backoff_strategy.backoff(response_mock, 1) + assert backoff == expected_backoff_time diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py new file mode 100644 index 0000000000000..27b47f97368cc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py @@ -0,0 +1,110 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
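The wait-until variant above turns an epoch timestamp taken from a header into a relative wait: the difference to the current time, bounded below by an optional minimum, with None when the header is absent or the computed wait is not positive and no minimum applies. A simplified stand-in, regex handling omitted (not the CDK class):

```python
import time
from typing import Optional

import requests


def wait_until_backoff(response: requests.Response, header: str, min_wait: Optional[float] = None) -> Optional[float]:
    try:
        wait_until = float(response.headers.get(header))
    except (TypeError, ValueError):
        wait_until = None  # header missing or not a plain number
    if wait_until is None:
        return float(min_wait) if min_wait is not None else None
    wait = wait_until - time.time()
    if min_wait is not None:
        return max(wait, float(min_wait))
    return wait if wait > 0 else None
```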
+# + +from unittest.mock import MagicMock + +import airbyte_cdk.sources.declarative.requesters.error_handlers.response_status as response_status +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import ResponseStatus + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, first_handler_behavior, second_handler_behavior, expected_behavior", + [ + ( + "test_chain_retrier_ok_ok", + response_status.SUCCESS, + response_status.SUCCESS, + response_status.SUCCESS, + ), + ( + "test_chain_retrier_ignore_fail", + response_status.IGNORE, + response_status.FAIL, + response_status.IGNORE, + ), + ( + "test_chain_retrier_fail_ignore", + response_status.FAIL, + response_status.IGNORE, + response_status.IGNORE, + ), + ( + "test_chain_retrier_ignore_retry", + response_status.IGNORE, + ResponseStatus.retry(SOME_BACKOFF_TIME), + response_status.IGNORE, + ), + ( + "test_chain_retrier_retry_ignore", + ResponseStatus.retry(SOME_BACKOFF_TIME), + response_status.IGNORE, + ResponseStatus.retry(SOME_BACKOFF_TIME), + ), + ( + "test_chain_retrier_retry_fail", + ResponseStatus.retry(SOME_BACKOFF_TIME), + response_status.FAIL, + ResponseStatus.retry(SOME_BACKOFF_TIME), + ), + ( + "test_chain_retrier_fail_retry", + response_status.FAIL, + ResponseStatus.retry(SOME_BACKOFF_TIME), + ResponseStatus.retry(SOME_BACKOFF_TIME), + ), + ( + "test_chain_retrier_ignore_ok", + response_status.IGNORE, + response_status.SUCCESS, + response_status.IGNORE, + ), + ( + "test_chain_retrier_ok_ignore", + response_status.SUCCESS, + response_status.IGNORE, + response_status.SUCCESS, + ), + ( + "test_chain_retrier_ok_retry", + response_status.SUCCESS, + ResponseStatus.retry(SOME_BACKOFF_TIME), + response_status.SUCCESS, + ), + ( + "test_chain_retrier_retry_ok", + ResponseStatus.retry(SOME_BACKOFF_TIME), + response_status.SUCCESS, + ResponseStatus.retry(SOME_BACKOFF_TIME), + ), + ( + "test_chain_retrier_return_first_retry", + ResponseStatus.retry(SOME_BACKOFF_TIME), + ResponseStatus.retry(2 * SOME_BACKOFF_TIME), + ResponseStatus.retry(SOME_BACKOFF_TIME), + ), + ], +) +def test_composite_error_handler(test_name, first_handler_behavior, second_handler_behavior, expected_behavior): + first_error_handler = MagicMock() + first_error_handler.should_retry.return_value = first_handler_behavior + second_error_handler = MagicMock() + second_error_handler.should_retry.return_value = second_handler_behavior + second_error_handler.should_retry.return_value = second_handler_behavior + retriers = [first_error_handler, second_error_handler] + retrier = CompositeErrorHandler(error_handlers=retriers, options={}) + response_mock = MagicMock() + response_mock.ok = first_handler_behavior == response_status.SUCCESS or second_handler_behavior == response_status.SUCCESS + assert retrier.should_retry(response_mock) == expected_behavior + + +def test_composite_error_handler_no_handlers(): + try: + CompositeErrorHandler(error_handlers=[], options={}) + assert False + except ValueError: + pass diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py new file mode 100644 index 0000000000000..eca5e4a71bd89 --- /dev/null +++ 
b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py @@ -0,0 +1,175 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from http import HTTPStatus +from unittest.mock import MagicMock + +import airbyte_cdk.sources.declarative.requesters.error_handlers.response_status as response_status +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.backoff_strategies.constant_backoff_strategy import ConstantBackoffStrategy +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import ( + DefaultErrorHandler, + HttpResponseFilter, + ResponseStatus, +) +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction + +SOME_BACKOFF_TIME = 60 + + +@pytest.mark.parametrize( + "test_name, http_code, retry_response_filter, ignore_response_filter, response_headers, should_retry, backoff_strategy", + [ + ("test_bad_gateway", HTTPStatus.BAD_GATEWAY, None, None, {}, ResponseStatus.retry(10), None), + ("test_too_many_requests", HTTPStatus.TOO_MANY_REQUESTS, None, None, {}, ResponseStatus.retry(10), None), + ( + "test_bad_gateway_constant_retry", + HTTPStatus.BAD_GATEWAY, + None, + None, + {}, + ResponseStatus.retry(SOME_BACKOFF_TIME), + [ConstantBackoffStrategy(SOME_BACKOFF_TIME)], + ), + ("test_exponential_backoff", HTTPStatus.BAD_GATEWAY, None, None, {}, ResponseStatus.retry(10), None), + ( + "test_bad_gateway_exponential_backoff_explicit_parameter", + HTTPStatus.BAD_GATEWAY, + None, + None, + {}, + ResponseStatus.retry(10), + [DefaultErrorHandler.DEFAULT_BACKOFF_STRATEGY()], + ), + ("test_chain_backoff_strategy", HTTPStatus.BAD_GATEWAY, None, None, {}, ResponseStatus.retry(10), None), + ( + "test_bad_gateway_chain_backoff", + HTTPStatus.BAD_GATEWAY, + None, + None, + {}, + ResponseStatus.retry(10), + [DefaultErrorHandler.DEFAULT_BACKOFF_STRATEGY(), ConstantBackoffStrategy(SOME_BACKOFF_TIME)], + ), + ("test_200", HTTPStatus.OK, None, None, {}, response_status.SUCCESS, None), + ("test_3XX", HTTPStatus.PERMANENT_REDIRECT, None, None, {}, response_status.SUCCESS, None), + ("test_403", HTTPStatus.FORBIDDEN, None, None, {}, response_status.FAIL, None), + ( + "test_403_ignore_error_message", + HTTPStatus.FORBIDDEN, + None, + HttpResponseFilter(action=ResponseAction.IGNORE, error_message_contains="found", options={}), + {}, + response_status.IGNORE, + None, + ), + ( + "test_403_dont_ignore_error_message", + HTTPStatus.FORBIDDEN, + None, + HttpResponseFilter(action=ResponseAction.IGNORE, error_message_contains="not_found", options={}), + {}, + response_status.FAIL, + None, + ), + ("test_429", HTTPStatus.TOO_MANY_REQUESTS, None, None, {}, ResponseStatus.retry(10), None), + ( + "test_ignore_403", + HTTPStatus.FORBIDDEN, + None, + HttpResponseFilter(action=ResponseAction.IGNORE, http_codes={HTTPStatus.FORBIDDEN}, options={}), + {}, + response_status.IGNORE, + None, + ), + ( + "test_403_with_predicate", + HTTPStatus.FORBIDDEN, + HttpResponseFilter(action=ResponseAction.RETRY, predicate="{{ 'code' in response }}", options={}), + None, + {}, + ResponseStatus.retry(10), + None, + ), + ( + "test_403_with_predicate", + HTTPStatus.FORBIDDEN, + HttpResponseFilter(action=ResponseAction.RETRY, predicate="{{ 'some_absent_field' in response }}", options={}), + None, + {}, + response_status.FAIL, + None, + ), + ( + "test_200_fail_with_predicate", + HTTPStatus.OK, + HttpResponseFilter(action=ResponseAction.FAIL, error_message_contains="found", options={}), + None, + 
{}, + response_status.FAIL, + None, + ), + ( + "test_retry_403", + HTTPStatus.FORBIDDEN, + HttpResponseFilter(action=ResponseAction.RETRY, http_codes={HTTPStatus.FORBIDDEN}, options={}), + None, + {}, + ResponseStatus.retry(10), + None, + ), + ( + "test_200_fail_with_predicate_from_header", + HTTPStatus.OK, + HttpResponseFilter(action=ResponseAction.FAIL, predicate="{{ headers['fail'] }}", options={}), + None, + {"fail": True}, + response_status.FAIL, + None, + ), + ], +) +def test_default_error_handler( + test_name, http_code, retry_response_filter, ignore_response_filter, response_headers, should_retry, backoff_strategy +): + response_mock = create_response(http_code, headers=response_headers, json_body={"code": "1000", "error": "found"}) + response_mock.ok = http_code < 400 + response_filters = [f for f in [retry_response_filter, ignore_response_filter] if f] + error_handler = DefaultErrorHandler(response_filters=response_filters, backoff_strategies=backoff_strategy, options={}) + actual_should_retry = error_handler.should_retry(response_mock) + assert actual_should_retry == should_retry + if should_retry.action == ResponseAction.RETRY: + assert actual_should_retry.retry_in == should_retry.retry_in + + +def test_default_error_handler_attempt_count_increases(): + status_code = 500 + response_mock = create_response(status_code) + error_handler = DefaultErrorHandler(options={}) + actual_should_retry = error_handler.should_retry(response_mock) + assert actual_should_retry == ResponseStatus.retry(10) + assert actual_should_retry.retry_in == 10 + + # This is the same request, so the count should increase + actual_should_retry = error_handler.should_retry(response_mock) + assert actual_should_retry == ResponseStatus.retry(20) + assert actual_should_retry.retry_in == 20 + + # This is a different request, so the count should not increase + another_identical_request = create_response(status_code) + actual_should_retry = error_handler.should_retry(another_identical_request) + assert actual_should_retry == ResponseStatus.retry(10) + assert actual_should_retry.retry_in == 10 + + +def create_response(status_code: int, headers=None, json_body=None): + url = "https://airbyte.io" + + response_mock = MagicMock() + response_mock.status_code = status_code + response_mock.ok = status_code < 400 or status_code >= 600 + response_mock.url = url + response_mock.headers = headers or {} + response_mock.json.return_value = json_body or {} + return response_mock diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py new file mode 100644 index 0000000000000..8e51abb1a7be3 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py @@ -0,0 +1,32 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
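Taken together, the error-handler tests above describe two layers. The default handler maps a response to an action (retrying 429 and 5XX by default, reshaped by optional response filters) and asks its backoff strategies how long to wait, with the wait growing on repeated retries of the same request (10s then 20s in test_default_error_handler_attempt_count_increases). The composite handler chains several handlers, and every case in its table is consistent with one precedence rule: keep the first verdict that is not FAIL, and fail only when every handler fails. That rule in isolation, with illustrative names:

```python
from enum import Enum
from typing import Iterable


class Action(Enum):
    SUCCESS = "success"
    IGNORE = "ignore"
    RETRY = "retry"
    FAIL = "fail"


def first_non_fail(verdicts: Iterable[Action]) -> Action:
    """Keep the first verdict that is not FAIL; fail only when every handler failed."""
    for verdict in verdicts:
        if verdict is not Action.FAIL:
            return verdict
    return Action.FAIL


assert first_non_fail([Action.FAIL, Action.IGNORE]) is Action.IGNORE
assert first_non_fail([Action.IGNORE, Action.RETRY]) is Action.IGNORE
assert first_non_fail([Action.FAIL, Action.FAIL]) is Action.FAIL
```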
+# + +import pytest +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus + + +@pytest.mark.parametrize( + "test_name, response_action, retry_in, expected_action, expected_backoff", + [ + ("test_fail_with_backoff", ResponseAction.FAIL, 10, None, None), + ("test_success_no_backoff", ResponseAction.FAIL, None, ResponseAction.FAIL, None), + ("test_ignore_with_backoff", ResponseAction.IGNORE, 10, None, None), + ("test_success_no_backoff", ResponseAction.IGNORE, None, ResponseAction.IGNORE, None), + ("test_success_with_backoff", ResponseAction.SUCCESS, 10, None, None), + ("test_success_no_backoff", ResponseAction.SUCCESS, None, ResponseAction.SUCCESS, None), + ("test_retry_with_backoff", ResponseAction.RETRY, 10, ResponseAction.RETRY, 10), + ("test_retry_no_backoff", ResponseAction.RETRY, None, ResponseAction.RETRY, None), + ], +) +def test_response_status(test_name, response_action, retry_in, expected_action, expected_backoff): + if expected_action or expected_backoff: + response_status = ResponseStatus(response_action, retry_in) + assert response_status.action == expected_action and response_status.retry_in == expected_backoff + else: + try: + ResponseStatus(response_action, retry_in) + assert False + except ValueError: + pass diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py new file mode 100644 index 0000000000000..2f3600e03cd2d --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
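test_response_status.py above encodes a single invariant: a backoff value is only meaningful together with the RETRY action, and every other combination is rejected at construction time. A minimal stand-in (not the CDK's ResponseStatus):

```python
from enum import Enum
from typing import Optional


class Action(Enum):
    SUCCESS = "success"
    IGNORE = "ignore"
    RETRY = "retry"
    FAIL = "fail"


class StatusSketch:
    def __init__(self, action: Action, retry_in: Optional[float] = None):
        if retry_in is not None and action is not Action.RETRY:
            raise ValueError(f"retry_in={retry_in} only makes sense with RETRY, got {action}")
        self.action = action
        self.retry_in = retry_in


StatusSketch(Action.RETRY, 10)  # ok
StatusSketch(Action.SUCCESS)    # ok
try:
    StatusSketch(Action.FAIL, 10)  # rejected: a failing response carries no retry delay
except ValueError:
    pass
```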
+# + +import json + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy + + +@pytest.mark.parametrize( + "test_name, template_string, stop_condition, expected_token", + [ + ("test_static_token", "token", None, "token"), + ("test_token_from_config", "{{ config.config_key }}", None, "config_value"), + ("test_token_from_last_record", "{{ last_records[-1].id }}", None, 1), + ("test_token_from_response", "{{ response._metadata.content }}", None, "content_value"), + ("test_token_from_options", "{{ options.key }}", None, "value"), + ("test_token_not_found", "{{ response.invalid_key }}", None, None), + ("test_static_token_with_stop_condition_false", "token", InterpolatedBoolean("{{False}}", options={}), "token"), + ("test_static_token_with_stop_condition_true", "token", InterpolatedBoolean("{{True}}", options={}), None), + ("test_token_from_header", "{{ headers.next }}", InterpolatedBoolean("{{ not headers.has_more }}", options={}), "ready_to_go"), + ( + "test_token_from_response_header_links", + "{{ headers.link.next.url }}", + InterpolatedBoolean("{{ not headers.link.next.url }}", options={}), + "https://adventure.io/api/v1/records?page=2&per_page=100", + ), + ], +) +def test_cursor_pagination_strategy(test_name, template_string, stop_condition, expected_token): + decoder = JsonDecoder(options={}) + config = {"config_key": "config_value"} + options = {"key": "value"} + strategy = CursorPaginationStrategy( + cursor_value=template_string, config=config, stop_condition=stop_condition, decoder=decoder, options=options + ) + + response = requests.Response() + link_str = '; rel="next"' + response.headers = {"has_more": True, "next": "ready_to_go", "link": link_str} + response_body = {"_metadata": {"content": "content_value"}, "accounts": [], "end": 99, "total": 200, "characters": {}} + response._content = json.dumps(response_body).encode("utf-8") + last_records = [{"id": 0, "more_records": True}, {"id": 1, "more_records": True}] + + token = strategy.next_page_token(response, last_records) + assert expected_token == token diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_limit_paginator.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_limit_paginator.py new file mode 100644 index 0000000000000..26d55a0276eea --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_limit_paginator.py @@ -0,0 +1,172 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
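The cursor-pagination cases above boil down to interpolation: cursor_value is a template evaluated against the decoded response, its headers, the last records, the config and the options, and an optional stop_condition ends pagination. A usage sketch, assuming the airbyte_cdk version introduced in this change is importable (next_cursor is an invented field name):

```python
import json

import requests
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy

strategy = CursorPaginationStrategy(
    cursor_value="{{ response.next_cursor }}",  # evaluated against the decoded response body
    stop_condition=None,
    decoder=JsonDecoder(options={}),
    config={},
    options={},
)

response = requests.Response()
response._content = json.dumps({"next_cursor": "abc123"}).encode("utf-8")

token = strategy.next_page_token(response, [{"id": 1}])
assert token == "abc123"
```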
+# + +import json +from unittest.mock import MagicMock + +import pytest +import requests +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean +from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import LimitPaginator, RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy + + +@pytest.mark.parametrize( + "test_name, page_token_request_option, stop_condition, expected_updated_path, expected_request_params, expected_headers, expected_body_data, expected_body_json, last_records, expected_next_page_token", + [ + ( + "test_limit_paginator_path", + RequestOption(inject_into=RequestOptionType.path, options={}), + None, + "/next_url", + {"limit": 2}, + {}, + {}, + {}, + [{"id": 0}, {"id": 1}], + {"next_page_token": "https://airbyte.io/next_url"}, + ), + ( + "test_limit_paginator_request_param", + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", options={}), + None, + None, + {"limit": 2, "from": "https://airbyte.io/next_url"}, + {}, + {}, + {}, + [{"id": 0}, {"id": 1}], + {"next_page_token": "https://airbyte.io/next_url"}, + ), + ( + "test_limit_paginator_no_token", + RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", options={}), + InterpolatedBoolean(condition="{{True}}", options={}), + None, + {"limit": 2}, + {}, + {}, + {}, + [{"id": 0}, {"id": 1}], + None, + ), + ( + "test_limit_paginator_cursor_header", + RequestOption(inject_into=RequestOptionType.header, field_name="from", options={}), + None, + None, + {"limit": 2}, + {"from": "https://airbyte.io/next_url"}, + {}, + {}, + [{"id": 0}, {"id": 1}], + {"next_page_token": "https://airbyte.io/next_url"}, + ), + ( + "test_limit_paginator_cursor_body_data", + RequestOption(inject_into=RequestOptionType.body_data, field_name="from", options={}), + None, + None, + {"limit": 2}, + {}, + {"from": "https://airbyte.io/next_url"}, + {}, + [{"id": 0}, {"id": 1}], + {"next_page_token": "https://airbyte.io/next_url"}, + ), + ( + "test_limit_paginator_cursor_body_json", + RequestOption(inject_into=RequestOptionType.body_json, field_name="from", options={}), + None, + None, + {"limit": 2}, + {}, + {}, + {"from": "https://airbyte.io/next_url"}, + [{"id": 0}, {"id": 1}], + {"next_page_token": "https://airbyte.io/next_url"}, + ), + ], +) +def test_limit_paginator( + test_name, + page_token_request_option, + stop_condition, + expected_updated_path, + expected_request_params, + expected_headers, + expected_body_data, + expected_body_json, + last_records, + expected_next_page_token, +): + limit_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="limit", options={}) + cursor_value = "{{ response.next }}" + url_base = "https://airbyte.io" + config = {} + options = {} + strategy = CursorPaginationStrategy( + cursor_value=cursor_value, stop_condition=stop_condition, decoder=JsonDecoder(options={}), config=config, options=options + ) + paginator = LimitPaginator( + page_size=2, + limit_option=limit_request_option, + page_token_option=page_token_request_option, + pagination_strategy=strategy, + config=config, + url_base=url_base, + options={}, + ) + + response = requests.Response() + response.headers = {"A_HEADER": "HEADER_VALUE"} + response_body = {"next": "https://airbyte.io/next_url"} + response._content = 
json.dumps(response_body).encode("utf-8") + + actual_next_page_token = paginator.next_page_token(response, last_records) + actual_next_path = paginator.path() + actual_request_params = paginator.get_request_params() + actual_headers = paginator.get_request_headers() + actual_body_data = paginator.get_request_body_data() + actual_body_json = paginator.get_request_body_json() + assert actual_next_page_token == expected_next_page_token + assert actual_next_path == expected_updated_path + assert actual_request_params == expected_request_params + assert actual_headers == expected_headers + assert actual_body_data == expected_body_data + assert actual_body_json == expected_body_json + + +def test_limit_cannot_be_set_in_path(): + limit_request_option = RequestOption(inject_into=RequestOptionType.path, options={}) + page_token_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", options={}) + cursor_value = "{{ response.next }}" + url_base = "https://airbyte.io" + config = {} + options = {} + strategy = CursorPaginationStrategy(cursor_value=cursor_value, config=config, options=options) + try: + LimitPaginator( + page_size=2, + limit_option=limit_request_option, + page_token_option=page_token_request_option, + pagination_strategy=strategy, + config=config, + url_base=url_base, + options={}, + ) + assert False + except ValueError: + pass + + +def test_reset(): + limit_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="limit", options={}) + page_token_request_option = RequestOption(inject_into=RequestOptionType.request_parameter, field_name="offset", options={}) + url_base = "https://airbyte.io" + config = {} + strategy = MagicMock() + LimitPaginator(2, limit_request_option, page_token_request_option, strategy, config, url_base, options={}).reset() + assert strategy.reset.called diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py new file mode 100644 index 0000000000000..637bebb8f910e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py @@ -0,0 +1,12 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination + + +def test(): + paginator = NoPagination(options={}) + next_page_token = paginator.next_page_token(requests.Response(), []) + assert next_page_token == {} diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py new file mode 100644 index 0000000000000..c8f11a76ad606 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
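LimitPaginator, exercised above, is wired with two RequestOptions: one says where the page size goes and the other where the token produced by the pagination strategy goes (query parameter, header, body, or the request path). A usage sketch mirroring the request-parameter case above, assuming this airbyte_cdk version is importable:

```python
import json

import requests
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder
from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import LimitPaginator, RequestOption, RequestOptionType
from airbyte_cdk.sources.declarative.requesters.paginators.strategies.cursor_pagination_strategy import CursorPaginationStrategy

strategy = CursorPaginationStrategy(cursor_value="{{ response.next }}", decoder=JsonDecoder(options={}), config={}, options={})
paginator = LimitPaginator(
    page_size=2,
    limit_option=RequestOption(inject_into=RequestOptionType.request_parameter, field_name="limit", options={}),
    page_token_option=RequestOption(inject_into=RequestOptionType.request_parameter, field_name="from", options={}),
    pagination_strategy=strategy,
    config={},
    url_base="https://airbyte.io",
    options={},
)

response = requests.Response()
response._content = json.dumps({"next": "https://airbyte.io/next_url"}).encode("utf-8")

# After seeing a response, both the page size and the extracted token are exposed as query params.
paginator.next_page_token(response, [{"id": 0}, {"id": 1}])
assert paginator.get_request_params() == {"limit": 2, "from": "https://airbyte.io/next_url"}
```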
+# + +import json + +import pytest +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.offset_increment import OffsetIncrement + + +@pytest.mark.parametrize( + "test_name, page_size, expected_next_page_token, expected_offset", + [ + ("test_same_page_size", 2, 2, 2), + ("test_larger_page_size", 3, None, 0), + ], +) +def test_offset_increment_paginator_strategy(test_name, page_size, expected_next_page_token, expected_offset): + paginator_strategy = OffsetIncrement(page_size, options={}) + assert paginator_strategy._offset == 0 + + response = requests.Response() + + response.headers = {"A_HEADER": "HEADER_VALUE"} + response_body = {"next": "https://airbyte.io/next_url"} + response._content = json.dumps(response_body).encode("utf-8") + last_records = [{"id": 0}, {"id": 1}] + + next_page_token = paginator_strategy.next_page_token(response, last_records) + assert expected_next_page_token == next_page_token + assert expected_offset == paginator_strategy._offset + + paginator_strategy.reset() + assert 0 == paginator_strategy._offset diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py new file mode 100644 index 0000000000000..9d85cf8298b9e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_page_increment.py @@ -0,0 +1,35 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import json + +import pytest +import requests +from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement + + +@pytest.mark.parametrize( + "test_name, page_size, expected_next_page_token, expected_offset", + [ + ("test_same_page_size", 2, 1, 1), + ("test_larger_page_size", 3, None, 0), + ], +) +def test_page_increment_paginator_strategy(test_name, page_size, expected_next_page_token, expected_offset): + paginator_strategy = PageIncrement(page_size, options={}) + assert paginator_strategy._page == 0 + + response = requests.Response() + + response.headers = {"A_HEADER": "HEADER_VALUE"} + response_body = {"next": "https://airbyte.io/next_url"} + response._content = json.dumps(response_body).encode("utf-8") + last_records = [{"id": 0}, {"id": 1}] + + next_page_token = paginator_strategy.next_page_token(response, last_records) + assert expected_next_page_token == next_page_token + assert expected_offset == paginator_strategy._page + + paginator_strategy.reset() + assert 0 == paginator_strategy._page diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py new file mode 100644 index 0000000000000..c54be6223be82 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/paginators/test_request_option.py @@ -0,0 +1,33 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
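The two increment strategies above differ only in what they count: OffsetIncrement advances a record offset by the size of the last page, PageIncrement advances a page number by one, and both stop once a page comes back smaller than the configured page size. Minimal stand-ins with the same contract (not the CDK classes):

```python
from typing import List, Optional


class OffsetIncrementSketch:
    def __init__(self, page_size: int):
        self.page_size = page_size
        self.offset = 0

    def next_page_token(self, last_records: List[dict]) -> Optional[int]:
        if len(last_records) < self.page_size:
            return None  # a short page means there is nothing left to fetch
        self.offset += len(last_records)
        return self.offset


class PageIncrementSketch:
    def __init__(self, page_size: int):
        self.page_size = page_size
        self.page = 0

    def next_page_token(self, last_records: List[dict]) -> Optional[int]:
        if len(last_records) < self.page_size:
            return None
        self.page += 1
        return self.page


assert OffsetIncrementSketch(2).next_page_token([{"id": 0}, {"id": 1}]) == 2
assert PageIncrementSketch(2).next_page_token([{"id": 0}, {"id": 1}]) == 1
assert OffsetIncrementSketch(3).next_page_token([{"id": 0}, {"id": 1}]) is None
```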
+# + +import pytest +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType + + +@pytest.mark.parametrize( + "test_name, option_type, field_name, should_raise", + [ + ("test_limit_path_no_field_name", RequestOptionType.path, None, False), + ("test_limit_path_with_field_name", RequestOptionType.path, "field", True), + ("test_limit_param_no_field_name", RequestOptionType.request_parameter, None, True), + ("test_limit_param_with_field_name", RequestOptionType.request_parameter, "field", False), + ("test_limit_header_no_field_name", RequestOptionType.header, None, True), + ("test_limit_header_with_field_name", RequestOptionType.header, "field", False), + ("test_limit_data_no_field_name", RequestOptionType.body_data, None, True), + ("test_limit_data_with_field_name", RequestOptionType.body_data, "field", False), + ("test_limit_json_no_field_name", RequestOptionType.body_json, None, True), + ("test_limit_json_with_field_name", RequestOptionType.body_json, "field", False), + ], +) +def test_request_option(test_name, option_type, field_name, should_raise): + try: + request_option = RequestOption(inject_into=option_type, field_name=field_name, options={}) + if should_raise: + assert False + assert request_option.field_name == field_name + assert request_option.inject_into == option_type + except ValueError: + if not should_raise: + assert False diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py new file mode 100644 index 0000000000000..1100c1c58cf51 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py new file mode 100644 index 0000000000000..457ddc9a22d8b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py @@ -0,0 +1,86 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
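test_request_option.py above pins down one validation rule: injecting into the request path is the only placement that takes no field name; every other placement requires one and raises ValueError otherwise. For example, assuming this airbyte_cdk version is importable (X-Cursor is an invented header name):

```python
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType

RequestOption(inject_into=RequestOptionType.path, options={})  # ok: the token replaces the path itself
RequestOption(inject_into=RequestOptionType.header, field_name="X-Cursor", options={})  # ok

try:
    RequestOption(inject_into=RequestOptionType.header, options={})  # no field_name, so this is rejected
except ValueError:
    print("a header placement needs a field_name")
```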
+# + +import pytest +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) + +state = {"date": "2021-01-01"} +stream_slice = {"start_date": "2020-01-01"} +next_page_token = {"offset": 12345, "page": 27} +config = {"option": "OPTION"} + + +@pytest.mark.parametrize( + "test_name, input_request_params, expected_request_params", + [ + ("test_static_param", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_value_depends_on_state", {"read_from_state": "{{ stream_state['date'] }}"}, {"read_from_state": "2021-01-01"}), + ("test_value_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_value_depends_on_next_page_token", {"read_from_token": "{{ next_page_token['offset'] }}"}, {"read_from_token": 12345}), + ("test_value_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": "OPTION"}), + ("test_none_value", {"missing_param": "{{ fake_path['date'] }}"}, {}), + ( + "test_parameter_is_interpolated", + {"{{ stream_state['date'] }} - {{stream_slice['start_date']}} - {{next_page_token['offset']}} - {{config['option']}}": "ABC"}, + {"2021-01-01 - 2020-01-01 - 12345 - OPTION": "ABC"}, + ), + ], +) +def test_interpolated_request_params(test_name, input_request_params, expected_request_params): + provider = InterpolatedRequestOptionsProvider(config=config, request_parameters=input_request_params, options={}) + + actual_request_params = provider.get_request_params(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_params == expected_request_params + + +@pytest.mark.parametrize( + "test_name, input_request_json, expected_request_json", + [ + ("test_static_json", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_value_depends_on_state", {"read_from_state": "{{ stream_state['date'] }}"}, {"read_from_state": "2021-01-01"}), + ("test_value_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_value_depends_on_next_page_token", {"read_from_token": "{{ next_page_token['offset'] }}"}, {"read_from_token": 12345}), + ("test_value_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": "OPTION"}), + ("test_none_value", {"missing_json": "{{ fake_path['date'] }}"}, {}), + ( + "test_interpolated_keys", + {"{{ stream_state['date'] }}": 123, "{{ config['option'] }}": "ABC"}, + {"2021-01-01": 123, "OPTION": "ABC"}, + ), + ], +) +def test_interpolated_request_json(test_name, input_request_json, expected_request_json): + provider = InterpolatedRequestOptionsProvider(config=config, request_body_json=input_request_json, options={}) + + actual_request_json = provider.get_request_body_json(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_json == expected_request_json + + +@pytest.mark.parametrize( + "test_name, input_request_data, expected_request_data", + [ + ("test_static_map_data", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_map_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['start_date'] }}"}, {"read_from_slice": "2020-01-01"}), + ("test_map_depends_on_config", {"read_from_config": "{{ config['option'] }}"}, {"read_from_config": 
"OPTION"}), + ("test_defaults_to_empty_dict", None, {}), + ("test_interpolated_keys", {"{{ stream_state['date'] }} - {{ next_page_token['offset'] }}": "ABC"}, {"2021-01-01 - 12345": "ABC"}), + ], +) +def test_interpolated_request_data(test_name, input_request_data, expected_request_data): + provider = InterpolatedRequestOptionsProvider(config=config, request_body_data=input_request_data, options={}) + + actual_request_data = provider.get_request_body_data(stream_state=state, stream_slice=stream_slice, next_page_token=next_page_token) + + assert actual_request_data == expected_request_data + + +def test_error_on_create_for_both_request_json_and_data(): + request_json = {"body_key": "{{ stream_slice['start_date'] }}"} + request_data = "interpolate_me=5&invalid={{ config['option'] }}" + with pytest.raises(ValueError): + InterpolatedRequestOptionsProvider(config=config, request_body_json=request_json, request_body_data=request_data, options={}) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py new file mode 100644 index 0000000000000..0a6c6b3d72c1b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_http_requester.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from unittest.mock import MagicMock + +import requests +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpMethod, HttpRequester + + +def test_http_requester(): + http_method = "GET" + + request_options_provider = MagicMock() + request_params = {"param": "value"} + request_body_data = "body_key_1=value_1&body_key_2=value2" + request_body_json = {"body_field": "body_value"} + request_options_provider.get_request_params.return_value = request_params + request_options_provider.get_request_body_data.return_value = request_body_data + request_options_provider.get_request_body_json.return_value = request_body_json + + request_headers_provider = MagicMock() + request_headers = {"header": "value"} + request_headers_provider.get_request_headers.return_value = request_headers + + authenticator = MagicMock() + + error_handler = MagicMock() + max_retries = 10 + should_retry = True + backoff_time = 1000 + error_handler.max_retries = max_retries + error_handler.should_retry.return_value = should_retry + error_handler.backoff_time.return_value = backoff_time + + config = {"url": "https://airbyte.io"} + stream_slice = {"id": "1234"} + + name = "stream_name" + + requester = HttpRequester( + name=name, + url_base=InterpolatedString.create("{{ config['url'] }}", options={}), + path=InterpolatedString.create("v1/{{ stream_slice['id'] }}", options={}), + http_method=http_method, + request_options_provider=request_options_provider, + authenticator=authenticator, + error_handler=error_handler, + config=config, + options={}, + ) + + assert requester.get_url_base() == "https://airbyte.io" + assert requester.get_path(stream_state={}, stream_slice=stream_slice, next_page_token={}) == "v1/1234" + assert requester.get_authenticator() == authenticator + assert requester.get_method() == HttpMethod.GET + assert requester.get_request_params(stream_state={}, stream_slice=None, next_page_token=None) == request_params + assert requester.get_request_body_data(stream_state={}, stream_slice=None, next_page_token=None) == request_body_data + assert 
requester.get_request_body_json(stream_state={}, stream_slice=None, next_page_token=None) == request_body_json + assert requester.should_retry(requests.Response()) == should_retry + assert {} == requester.request_kwargs(stream_state={}, stream_slice=None, next_page_token=None) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py new file mode 100644 index 0000000000000..74ee47267c35e --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py @@ -0,0 +1,27 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import pytest as pytest +from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_input_provider import InterpolatedRequestInputProvider + + +@pytest.mark.parametrize( + "test_name, input_request_data, expected_request_data", + [ + ("test_static_map_data", {"a_static_request_param": "a_static_value"}, {"a_static_request_param": "a_static_value"}), + ("test_map_depends_on_stream_slice", {"read_from_slice": "{{ stream_slice['slice_key'] }}"}, {"read_from_slice": "slice_value"}), + ("test_map_depends_on_config", {"read_from_config": "{{ config['config_key'] }}"}, {"read_from_config": "value_of_config"}), + ("test_defaults_to_empty_dictionary", None, {}), + ], +) +def test_initialize_interpolated_mapping_request_input_provider(test_name, input_request_data, expected_request_data): + config = {"config_key": "value_of_config"} + stream_slice = {"slice_key": "slice_value"} + + provider = InterpolatedRequestInputProvider(config=config, request_inputs=input_request_data) + actual_request_data = provider.request_inputs(stream_state={}, stream_slice=stream_slice) + + assert isinstance(provider._interpolator, InterpolatedMapping) + assert actual_request_data == expected_request_data diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py new file mode 100644 index 0000000000000..6639aa6ec8073 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/retrievers/test_simple_retriever.py @@ -0,0 +1,343 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
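Stepping back to the interpolated request-options provider tested a little further above: every value, and even every key, is a template evaluated against config, stream_state, stream_slice and next_page_token, and entries that resolve to nothing are dropped. A usage sketch, assuming this airbyte_cdk version is importable (since and flavour are invented parameter names):

```python
from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import (
    InterpolatedRequestOptionsProvider,
)

provider = InterpolatedRequestOptionsProvider(
    config={"option": "OPTION"},
    request_parameters={"since": "{{ stream_state['date'] }}", "flavour": "{{ config['option'] }}"},
    options={},
)

params = provider.get_request_params(
    stream_state={"date": "2021-01-01"},
    stream_slice={"start_date": "2020-01-01"},
    next_page_token={"offset": 12345},
)
assert params == {"since": "2021-01-01", "flavour": "OPTION"}
```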
+# + +from unittest.mock import MagicMock, patch + +import airbyte_cdk.sources.declarative.requesters.error_handlers.response_status as response_status +import pytest +import requests +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.read_exception import ReadException +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_action import ResponseAction +from airbyte_cdk.sources.declarative.requesters.error_handlers.response_status import ResponseStatus +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOptionType +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.streams.http.auth import NoAuth +from airbyte_cdk.sources.streams.http.http import HttpStream + +primary_key = "pk" +records = [{"id": 1}, {"id": 2}] +config = {} + + +@patch.object(HttpStream, "read_records", return_value=[]) +def test_simple_retriever_full(mock_http_stream): + requester = MagicMock() + request_params = {"param": "value"} + requester.get_request_params.return_value = request_params + + paginator = MagicMock() + next_page_token = {"cursor": "cursor_value"} + paginator.path.return_value = None + paginator.next_page_token.return_value = next_page_token + + record_selector = MagicMock() + record_selector.select_records.return_value = records + + iterator = MagicMock() + stream_slices = [{"date": "2022-01-01"}, {"date": "2022-01-02"}] + iterator.stream_slices.return_value = stream_slices + + response = requests.Response() + + underlying_state = {"date": "2021-01-01"} + iterator.get_stream_state.return_value = underlying_state + + requester.get_authenticator.return_value = NoAuth + url_base = "https://airbyte.io" + requester.get_url_base.return_value = url_base + path = "/v1" + requester.get_path.return_value = path + http_method = HttpMethod.GET + requester.get_method.return_value = http_method + backoff_time = 60 + should_retry = ResponseStatus.retry(backoff_time) + requester.should_retry.return_value = should_retry + request_body_json = {"body": "json"} + requester.request_body_json.return_value = request_body_json + + request_body_data = {"body": "data"} + requester.get_request_body_data.return_value = request_body_data + request_body_json = {"body": "json"} + requester.get_request_body_json.return_value = request_body_json + request_kwargs = {"kwarg": "value"} + requester.request_kwargs.return_value = request_kwargs + cache_filename = "cache" + requester.cache_filename = cache_filename + use_cache = True + requester.use_cache = use_cache + + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + paginator=paginator, + record_selector=record_selector, + stream_slicer=iterator, + options={}, + ) + + assert retriever.primary_key == primary_key + assert retriever.url_base == url_base + assert retriever.path() == path + assert retriever.state == underlying_state + assert retriever.next_page_token(response) == next_page_token + assert retriever.request_params(None, None, None) == request_params + assert retriever.stream_slices(sync_mode=SyncMode.incremental) == stream_slices + + assert retriever._last_response is None + assert retriever._last_records is None + assert retriever.parse_response(response, stream_state=None) == records + assert retriever._last_response == response + assert retriever._last_records == records + + assert retriever.http_method == "GET" + 
assert not retriever.raise_on_http_errors + assert retriever.should_retry(requests.Response()) + assert retriever.backoff_time(requests.Response()) == backoff_time + assert retriever.request_body_json(None, None, None) == request_body_json + assert retriever.request_kwargs(None, None, None) == request_kwargs + assert retriever.cache_filename == cache_filename + assert retriever.use_cache == use_cache + + [r for r in retriever.read_records(SyncMode.full_refresh)] + paginator.reset.assert_called() + + +@pytest.mark.parametrize( + "test_name, requester_response, expected_should_retry, expected_backoff_time", + [ + ("test_should_retry_fail", response_status.FAIL, False, None), + ("test_should_retry_none_backoff", ResponseStatus.retry(None), True, None), + ("test_should_retry_custom_backoff", ResponseStatus.retry(60), True, 60), + ], +) +def test_should_retry(test_name, requester_response, expected_should_retry, expected_backoff_time): + requester = MagicMock() + retriever = SimpleRetriever(name="stream_name", primary_key=primary_key, requester=requester, record_selector=MagicMock(), options={}) + requester.should_retry.return_value = requester_response + assert retriever.should_retry(requests.Response()) == expected_should_retry + if requester_response.action == ResponseAction.RETRY: + assert retriever.backoff_time(requests.Response()) == expected_backoff_time + + +@pytest.mark.parametrize( + "test_name, status_code, response_status, len_expected_records", + [ + ("test_parse_response_fails_if_should_retry_is_fail", 404, response_status.FAIL, None), + ("test_parse_response_succeeds_if_should_retry_is_ok", 200, response_status.SUCCESS, 1), + ("test_parse_response_succeeds_if_should_retry_is_ignore", 404, response_status.IGNORE, 0), + ], +) +def test_parse_response(test_name, status_code, response_status, len_expected_records): + requester = MagicMock() + record_selector = MagicMock() + record_selector.select_records.return_value = [{"id": 100}] + retriever = SimpleRetriever( + name="stream_name", primary_key=primary_key, requester=requester, record_selector=record_selector, options={} + ) + response = requests.Response() + response.status_code = status_code + requester.should_retry.return_value = response_status + if len_expected_records is None: + try: + retriever.parse_response(response, stream_state={}) + assert False + except ReadException: + pass + else: + records = retriever.parse_response(response, stream_state={}) + assert len(records) == len_expected_records + + +@pytest.mark.parametrize( + "test_name, response_action, retry_in, expected_backoff_time", + [ + ("test_backoff_retriable_request", ResponseAction.RETRY, 10, 10), + ("test_backoff_fail_request", ResponseAction.FAIL, 10, None), + ("test_backoff_ignore_request", ResponseAction.IGNORE, 10, None), + ("test_backoff_success_request", ResponseAction.IGNORE, 10, None), + ], +) +def test_backoff_time(test_name, response_action, retry_in, expected_backoff_time): + requester = MagicMock() + record_selector = MagicMock() + record_selector.select_records.return_value = [{"id": 100}] + response = requests.Response() + retriever = SimpleRetriever( + name="stream_name", primary_key=primary_key, requester=requester, record_selector=record_selector, options={} + ) + if expected_backoff_time: + requester.should_retry.return_value = ResponseStatus(response_action, retry_in) + actual_backoff_time = retriever.backoff_time(response) + assert expected_backoff_time == actual_backoff_time + else: + try: + retriever.backoff_time(response) + assert False 
+ except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, paginator_mapping, stream_slicer_mapping, expected_mapping", + [ + ("test_only_base_headers", {}, {}, {"key": "value"}), + ("test_header_from_pagination", {"offset": 1000}, {}, {"key": "value", "offset": 1000}), + ("test_header_from_stream_slicer", {}, {"slice": "slice_value"}, {"key": "value", "slice": "slice_value"}), + ("test_duplicate_header_slicer", {}, {"key": "slice_value"}, None), + ("test_duplicate_header_slicer_paginator", {"k": "v"}, {"k": "slice_value"}, None), + ("test_duplicate_header_paginator", {"key": 1000}, {}, None), + ], +) +def test_get_request_options_from_pagination(test_name, paginator_mapping, stream_slicer_mapping, expected_mapping): + # This test does not test request headers because they must be strings + paginator = MagicMock() + paginator.get_request_params.return_value = paginator_mapping + paginator.get_request_body_data.return_value = paginator_mapping + paginator.get_request_body_json.return_value = paginator_mapping + + stream_slicer = MagicMock() + stream_slicer.get_request_params.return_value = stream_slicer_mapping + stream_slicer.get_request_body_data.return_value = stream_slicer_mapping + stream_slicer.get_request_body_json.return_value = stream_slicer_mapping + + base_mapping = {"key": "value"} + requester = MagicMock() + requester.get_request_params.return_value = base_mapping + requester.get_request_body_data.return_value = base_mapping + requester.get_request_body_json.return_value = base_mapping + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + paginator=paginator, + stream_slicer=stream_slicer, + options={}, + ) + + request_option_type_to_method = { + RequestOptionType.request_parameter: retriever.request_params, + RequestOptionType.body_data: retriever.request_body_data, + RequestOptionType.body_json: retriever.request_body_json, + } + + for _, method in request_option_type_to_method.items(): + if expected_mapping: + actual_mapping = method(None, None, None) + assert expected_mapping == actual_mapping + else: + try: + method(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, paginator_mapping, expected_mapping", + [ + ("test_only_base_headers", {}, {"key": "value"}), + ("test_header_from_pagination", {"offset": 1000}, {"key": "value", "offset": "1000"}), + ("test_duplicate_header", {"key": 1000}, None), + ], +) +def test_get_request_headers(test_name, paginator_mapping, expected_mapping): + # This test is separate from the other request options because request headers must be strings + paginator = MagicMock() + paginator.get_request_headers.return_value = paginator_mapping + requester = MagicMock() + + base_mapping = {"key": "value"} + requester.get_request_headers.return_value = base_mapping + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", primary_key=primary_key, requester=requester, record_selector=record_selector, paginator=paginator, options={} + ) + + request_option_type_to_method = { + RequestOptionType.header: retriever.request_headers, + } + + for _, method in request_option_type_to_method.items(): + if expected_mapping: + actual_mapping = method(None, None, None) + assert expected_mapping == actual_mapping + else: + try: + method(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, 
requester_body_data, paginator_body_data, expected_body_data", + [ + ("test_only_requester_mapping", {"key": "value"}, {}, {"key": "value"}), + ("test_only_requester_string", "key=value", {}, "key=value"), + ("test_requester_mapping_and_paginator_no_duplicate", {"key": "value"}, {"offset": 1000}, {"key": "value", "offset": 1000}), + ("test_requester_mapping_and_paginator_with_duplicate", {"key": "value"}, {"key": 1000}, None), + ("test_requester_string_and_paginator", "key=value", {"offset": 1000}, None), + ], +) +def test_request_body_data(test_name, requester_body_data, paginator_body_data, expected_body_data): + paginator = MagicMock() + paginator.get_request_body_data.return_value = paginator_body_data + requester = MagicMock() + + requester.get_request_body_data.return_value = requester_body_data + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + paginator=paginator, + options={}, + ) + + if expected_body_data: + actual_body_data = retriever.request_body_data(None, None, None) + assert expected_body_data == actual_body_data + else: + try: + retriever.request_body_data(None, None, None) + assert False + except ValueError: + pass + + +@pytest.mark.parametrize( + "test_name, requester_path, paginator_path, expected_path", + [ + ("test_path_from_requester", "/v1/path", None, "/v1/path"), + ("test_path_from_paginator", "/v1/path/", "/v2/paginator", "/v2/paginator"), + ], +) +def test_path(test_name, requester_path, paginator_path, expected_path): + paginator = MagicMock() + paginator.path.return_value = paginator_path + requester = MagicMock() + + requester.get_path.return_value = requester_path + + record_selector = MagicMock() + retriever = SimpleRetriever( + name="stream_name", + primary_key=primary_key, + requester=requester, + record_selector=record_selector, + paginator=paginator, + options={}, + ) + + actual_path = retriever.path(stream_state=None, stream_slice=None, next_page_token=None) + assert expected_path == actual_path diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/states/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py new file mode 100644 index 0000000000000..46b7376756ec6 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/__init__.py @@ -0,0 +1,3 @@ +# +# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py new file mode 100644 index 0000000000000..3ed21485c3c0a --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py @@ -0,0 +1,177 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
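Several of the SimpleRetriever tests above (request params, headers, body data and body json) check the same composition rule: options contributed by the requester, the paginator and the stream slicer are merged, and a key supplied by more than one of them is an error rather than a silent overwrite. That rule in isolation (illustrative, not the CDK code):

```python
from typing import Mapping, Optional


def combine_request_options(*mappings: Optional[Mapping[str, object]]) -> dict:
    combined: dict = {}
    for mapping in mappings:
        for key, value in (mapping or {}).items():
            if key in combined:
                raise ValueError(f"duplicate request option: {key}")
            combined[key] = value
    return combined


assert combine_request_options({"key": "value"}, {"offset": 1000}) == {"key": "value", "offset": 1000}
try:
    combine_request_options({"key": "value"}, {"key": 1000})
except ValueError as error:
    print(error)  # duplicate request option: key
```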
+# + +import pytest as pytest +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.cartesian_product_stream_slicer import CartesianProductStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer import DatetimeStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer import ListStreamSlicer + + +@pytest.mark.parametrize( + "test_name, stream_slicers, expected_slices", + [ + ( + "test_single_stream_slicer", + [ListStreamSlicer(slice_values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, options={})], + [{"owner_resource": "customer"}, {"owner_resource": "store"}, {"owner_resource": "subscription"}], + ), + ( + "test_two_stream_slicers", + [ + ListStreamSlicer(slice_values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, options={}), + ListStreamSlicer(slice_values=["A", "B"], cursor_field="letter", config={}, options={}), + ], + [ + {"owner_resource": "customer", "letter": "A"}, + {"owner_resource": "customer", "letter": "B"}, + {"owner_resource": "store", "letter": "A"}, + {"owner_resource": "store", "letter": "B"}, + {"owner_resource": "subscription", "letter": "A"}, + {"owner_resource": "subscription", "letter": "B"}, + ], + ), + ( + "test_list_and_datetime", + [ + ListStreamSlicer(slice_values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, options={}), + DatetimeStreamSlicer( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", options={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", options={}), + step="1d", + cursor_field=InterpolatedString.create("", options={}), + datetime_format="%Y-%m-%d", + config={}, + options={}, + ), + ], + [ + {"owner_resource": "customer", "start_time": "2021-01-01", "end_time": "2021-01-01"}, + {"owner_resource": "customer", "start_time": "2021-01-02", "end_time": "2021-01-02"}, + {"owner_resource": "customer", "start_time": "2021-01-03", "end_time": "2021-01-03"}, + {"owner_resource": "store", "start_time": "2021-01-01", "end_time": "2021-01-01"}, + {"owner_resource": "store", "start_time": "2021-01-02", "end_time": "2021-01-02"}, + {"owner_resource": "store", "start_time": "2021-01-03", "end_time": "2021-01-03"}, + {"owner_resource": "subscription", "start_time": "2021-01-01", "end_time": "2021-01-01"}, + {"owner_resource": "subscription", "start_time": "2021-01-02", "end_time": "2021-01-02"}, + {"owner_resource": "subscription", "start_time": "2021-01-03", "end_time": "2021-01-03"}, + ], + ), + ], +) +def test_substream_slicer(test_name, stream_slicers, expected_slices): + slicer = CartesianProductStreamSlicer(stream_slicers=stream_slicers, options={}) + slices = [s for s in slicer.stream_slices(SyncMode.incremental, stream_state=None)] + assert slices == expected_slices + + +@pytest.mark.parametrize( + "test_name, stream_slice, expected_state", + [ + ("test_update_cursor_no_state_no_record", {}, {}), + ("test_update_cursor_partial_state", {"owner_resource": "customer"}, {"owner_resource": "customer"}), + ( + "test_update_cursor_full_state", + {"owner_resource": "customer", "date": "2021-01-01"}, + 
{"owner_resource": "customer", "date": "2021-01-01"}, + ), + ], +) +def test_update_cursor(test_name, stream_slice, expected_state): + stream_slicers = [ + ListStreamSlicer(slice_values=["customer", "store", "subscription"], cursor_field="owner_resource", config={}, options={}), + DatetimeStreamSlicer( + start_datetime=MinMaxDatetime(datetime="2021-01-01", datetime_format="%Y-%m-%d", options={}), + end_datetime=MinMaxDatetime(datetime="2021-01-03", datetime_format="%Y-%m-%d", options={}), + step="1d", + cursor_field=InterpolatedString(string="date", options={}), + datetime_format="%Y-%m-%d", + config={}, + options={}, + ), + ] + slicer = CartesianProductStreamSlicer(stream_slicers=stream_slicers, options={}) + slicer.update_cursor(stream_slice, None) + updated_state = slicer.get_stream_state() + assert expected_state == updated_state + + +@pytest.mark.parametrize( + "test_name, stream_1_request_option, stream_2_request_option, expected_req_params, expected_headers,expected_body_json, expected_body_data", + [ + ( + "test_param_header", + RequestOption(inject_into=RequestOptionType.request_parameter, options={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="repo"), + {"owner": "customer"}, + {"repo": "airbyte"}, + {}, + {}, + ), + ( + "test_header_header", + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="repo"), + {}, + {"owner": "customer", "repo": "airbyte"}, + {}, + {}, + ), + ( + "test_body_data", + RequestOption(inject_into=RequestOptionType.body_data, options={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.body_data, options={}, field_name="repo"), + {}, + {}, + {}, + {"owner": "customer", "repo": "airbyte"}, + ), + ( + "test_body_json", + RequestOption(inject_into=RequestOptionType.body_json, options={}, field_name="owner"), + RequestOption(inject_into=RequestOptionType.body_json, options={}, field_name="repo"), + {}, + {}, + {"owner": "customer", "repo": "airbyte"}, + {}, + ), + ], +) +def test_request_option( + test_name, + stream_1_request_option, + stream_2_request_option, + expected_req_params, + expected_headers, + expected_body_json, + expected_body_data, +): + slicer = CartesianProductStreamSlicer( + stream_slicers=[ + ListStreamSlicer( + slice_values=["customer", "store", "subscription"], + cursor_field="owner_resource", + config={}, + request_option=stream_1_request_option, + options={}, + ), + ListStreamSlicer( + slice_values=["airbyte", "airbyte-cloud"], + cursor_field="repository", + config={}, + request_option=stream_2_request_option, + options={}, + ), + ], + options={}, + ) + slicer.update_cursor({"owner_resource": "customer", "repository": "airbyte"}, None) + + assert expected_req_params == slicer.get_request_params() + assert expected_headers == slicer.get_request_headers() + assert expected_body_json == slicer.get_request_body_json() + assert expected_body_data == slicer.get_request_body_data() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_datetime_stream_slicer.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_datetime_stream_slicer.py new file mode 100644 index 0000000000000..e2321ad607f21 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_datetime_stream_slicer.py @@ -0,0 +1,505 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +import datetime +import unittest + +import pytest +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer import DatetimeStreamSlicer + +datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" +FAKE_NOW = datetime.datetime(2022, 1, 1, tzinfo=datetime.timezone.utc) + +config = {"start_date": "2021-01-01T00:00:00.000000+0000", "start_date_ymd": "2021-01-01"} +end_date_now = InterpolatedString(string="{{ today_utc() }}", options={}) +cursor_field = "created" +timezone = datetime.timezone.utc + + +@pytest.fixture() +def mock_datetime_now(monkeypatch): + datetime_mock = unittest.mock.MagicMock(wraps=datetime.datetime) + datetime_mock.now.return_value = FAKE_NOW + monkeypatch.setattr(datetime, "datetime", datetime_mock) + + +@pytest.mark.parametrize( + "test_name, stream_state, start, end, step, cursor_field, lookback_window, datetime_format, expected_slices", + [ + ( + "test_1_day", + None, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T00:00:00.000000+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T00:00:00.000000+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T00:00:00.000000+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_2_day", + None, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "2d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-02T00:00:00.000000+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_from_stream_state", + {"date": "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="{{ stream_state['date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": 
"2021-01-05T00:00:00.000000+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_12_day", + None, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "12d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_end_time_greater_than_now", + None, + MinMaxDatetime(datetime="2021-12-28T00:00:00.000000+0000", options={}), + MinMaxDatetime(datetime=f"{(FAKE_NOW + datetime.timedelta(days=1)).strftime(datetime_format)}", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-12-28T00:00:00.000000+0000", "end_time": "2021-12-28T00:00:00.000000+0000"}, + {"start_time": "2021-12-29T00:00:00.000000+0000", "end_time": "2021-12-29T00:00:00.000000+0000"}, + {"start_time": "2021-12-30T00:00:00.000000+0000", "end_time": "2021-12-30T00:00:00.000000+0000"}, + {"start_time": "2021-12-31T00:00:00.000000+0000", "end_time": "2021-12-31T00:00:00.000000+0000"}, + {"start_time": "2022-01-01T00:00:00.000000+0000", "end_time": "2022-01-01T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_date_greater_than_end_time", + None, + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + MinMaxDatetime(datetime="2021-01-05T00:00:00.000000+0000", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_cursor_date_greater_than_start_date", + {"date": "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="{{ stream_state['date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "1d", + InterpolatedString(string="{{ stream_state['date'] }}", options={}), + None, + datetime_format, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_cursor_date_greater_than_start_date_multiday_step", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-03T00:00:00.000000+0000", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "2d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": 
"2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_date_less_than_min_date", + {"date": "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="{{ config['start_date'] }}", min_datetime="{{ stream_state['date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "1d", + InterpolatedString(string="{{ stream_state['date'] }}", options={}), + None, + datetime_format, + [ + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ( + "test_end_date_greater_than_max_date", + {"date": "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", max_datetime="{{ stream_state['date'] }}", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T00:00:00.000000+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T00:00:00.000000+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T00:00:00.000000+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_end_min_max_inherits_datetime_format_from_stream_slicer", + {"date": "2021-01-05"}, + MinMaxDatetime(datetime="{{ config['start_date_ymd'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10", max_datetime="{{ stream_state['date'] }}", options={}), + "1d", + cursor_field, + None, + "%Y-%m-%d", + [ + {"start_time": "2021-01-01", "end_time": "2021-01-01"}, + {"start_time": "2021-01-02", "end_time": "2021-01-02"}, + {"start_time": "2021-01-03", "end_time": "2021-01-03"}, + {"start_time": "2021-01-04", "end_time": "2021-01-04"}, + {"start_time": "2021-01-05", "end_time": "2021-01-05"}, + ], + ), + ( + "test_with_lookback_window_from_start_date", + {"date": "2021-01-05"}, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10", max_datetime="{{ stream_state['date'] }}", datetime_format="%Y-%m-%d", options={}), + "1d", + cursor_field, + "3d", + datetime_format, + [ + {"start_time": "2020-12-29T00:00:00.000000+0000", "end_time": "2020-12-29T00:00:00.000000+0000"}, + {"start_time": "2020-12-30T00:00:00.000000+0000", "end_time": "2020-12-30T00:00:00.000000+0000"}, + {"start_time": "2020-12-31T00:00:00.000000+0000", "end_time": "2020-12-31T00:00:00.000000+0000"}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T00:00:00.000000+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T00:00:00.000000+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T00:00:00.000000+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": 
"2021-01-04T00:00:00.000000+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_with_lookback_window_defaults_to_0d", + {"date": "2021-01-05"}, + MinMaxDatetime(datetime="{{ config['start_date'] }}", options={}), + MinMaxDatetime(datetime="2021-01-10", max_datetime="{{ stream_state['date'] }}", datetime_format="%Y-%m-%d", options={}), + "1d", + cursor_field, + "{{ config['does_not_exist'] }}", + datetime_format, + [ + {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-01T00:00:00.000000+0000"}, + {"start_time": "2021-01-02T00:00:00.000000+0000", "end_time": "2021-01-02T00:00:00.000000+0000"}, + {"start_time": "2021-01-03T00:00:00.000000+0000", "end_time": "2021-01-03T00:00:00.000000+0000"}, + {"start_time": "2021-01-04T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"}, + {"start_time": "2021-01-05T00:00:00.000000+0000", "end_time": "2021-01-05T00:00:00.000000+0000"}, + ], + ), + ( + "test_start_is_after_stream_state", + {cursor_field: "2021-01-05T00:00:00.000000+0000"}, + MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", options={}), + MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + "1d", + cursor_field, + None, + datetime_format, + [ + {"start_time": "2021-01-06T00:00:00.000000+0000", "end_time": "2021-01-06T00:00:00.000000+0000"}, + {"start_time": "2021-01-07T00:00:00.000000+0000", "end_time": "2021-01-07T00:00:00.000000+0000"}, + {"start_time": "2021-01-08T00:00:00.000000+0000", "end_time": "2021-01-08T00:00:00.000000+0000"}, + {"start_time": "2021-01-09T00:00:00.000000+0000", "end_time": "2021-01-09T00:00:00.000000+0000"}, + {"start_time": "2021-01-10T00:00:00.000000+0000", "end_time": "2021-01-10T00:00:00.000000+0000"}, + ], + ), + ], +) +def test_stream_slices( + mock_datetime_now, test_name, stream_state, start, end, step, cursor_field, lookback_window, datetime_format, expected_slices +): + lookback_window = InterpolatedString(string=lookback_window, options={}) if lookback_window else None + slicer = DatetimeStreamSlicer( + start_datetime=start, + end_datetime=end, + step=step, + cursor_field=cursor_field, + datetime_format=datetime_format, + lookback_window=lookback_window, + config=config, + options={}, + ) + stream_slices = slicer.stream_slices(SyncMode.incremental, stream_state) + + assert expected_slices == stream_slices + + +@pytest.mark.parametrize( + "test_name, previous_cursor, stream_slice, last_record, expected_state", + [ + ("test_update_cursor_no_state_no_record", None, {}, None, {}), + ( + "test_update_cursor_with_state_no_record", + None, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + None, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + ), + ( + "test_update_cursor_with_state_equals_record", + None, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + ), + ( + "test_update_cursor_with_state_greater_than_record", + None, + {cursor_field: "2021-01-03T00:00:00.000000+0000"}, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + {cursor_field: "2021-01-03T00:00:00.000000+0000"}, + ), + ( + "test_update_cursor_with_state_less_than_record", + None, + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + {cursor_field: "2021-01-03T00:00:00.000000+0000"}, + {cursor_field: "2021-01-03T00:00:00.000000+0000"}, + ), + ( + "test_update_cursor_with_state_less_than_previous_cursor", + 
"2021-01-03T00:00:00.000000+0000", + {cursor_field: "2021-01-02T00:00:00.000000+0000"}, + {}, + {cursor_field: "2021-01-03T00:00:00.000000+0000"}, + ), + ], +) +def test_update_cursor(test_name, previous_cursor, stream_slice, last_record, expected_state): + slicer = DatetimeStreamSlicer( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", options={}), + end_datetime=MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + step="1d", + cursor_field=InterpolatedString(string=cursor_field, options={}), + datetime_format=datetime_format, + lookback_window=InterpolatedString(string="0d", options={}), + config=config, + options={}, + ) + slicer._cursor = previous_cursor + slicer.update_cursor(stream_slice, last_record) + updated_state = slicer.get_stream_state() + assert expected_state == updated_state + + +@pytest.mark.parametrize( + "test_name, inject_into, field_name, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ("test_start_time_inject_into_none", None, None, {}, {}, {}, {}), + ( + "test_start_time_passed_by_req_param", + RequestOptionType.request_parameter, + "start_time", + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + {}, + {}, + ), + ( + "test_start_time_inject_into_header", + RequestOptionType.header, + "start_time", + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + {}, + ), + ( + "test_start_time_inject_intoy_body_json", + RequestOptionType.body_json, + "start_time", + {}, + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + {}, + ), + ( + "test_start_time_inject_into_body_data", + RequestOptionType.body_data, + "start_time", + {}, + {}, + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + ), + ( + "test_start_time_inject_into_path", + RequestOptionType.path, + "start_time", + {}, + {}, + {}, + {"start_time": "2021-01-01T00:00:00.000000+0000", "endtime": "2021-01-04T00:00:00.000000+0000"}, + ), + ], +) +def test_request_option(test_name, inject_into, field_name, expected_req_params, expected_headers, expected_body_json, expected_body_data): + if inject_into == RequestOptionType.path: + start_request_option = RequestOption(inject_into=inject_into, options={}) + end_request_option = RequestOption(inject_into=inject_into, options={}) + try: + DatetimeStreamSlicer( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", options={}), + end_datetime=MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + step="1d", + cursor_field=InterpolatedString(string=cursor_field, options={}), + datetime_format=datetime_format, + lookback_window=InterpolatedString(string="0d", options={}), + start_time_option=start_request_option, + end_time_option=end_request_option, + config=config, + options={}, + ) + assert False + except ValueError: + return + else: + start_request_option = RequestOption(inject_into=inject_into, options={}, field_name=field_name) if inject_into else None + end_request_option = RequestOption(inject_into=inject_into, options={}, field_name="endtime") if inject_into else None + slicer = DatetimeStreamSlicer( + start_datetime=MinMaxDatetime(datetime="2021-01-01T00:00:00.000000+0000", options={}), + end_datetime=MinMaxDatetime(datetime="2021-01-10T00:00:00.000000+0000", options={}), + step="1d", + 
cursor_field=InterpolatedString(string=cursor_field, options={}), + datetime_format=datetime_format, + lookback_window=InterpolatedString(string="0d", options={}), + start_time_option=start_request_option, + end_time_option=end_request_option, + config=config, + options={}, + ) + stream_slice = {"start_time": "2021-01-01T00:00:00.000000+0000", "end_time": "2021-01-04T00:00:00.000000+0000"} + + slicer.update_cursor(stream_slice) + + assert expected_req_params == slicer.get_request_params(stream_slice=stream_slice) + assert expected_headers == slicer.get_request_headers(stream_slice=stream_slice) + assert expected_body_json == slicer.get_request_body_json(stream_slice=stream_slice) + assert expected_body_data == slicer.get_request_body_data(stream_slice=stream_slice) + + +@pytest.mark.parametrize( + "test_name, input_date, date_format, expected_output_date", + [ + ( + "test_parse_date_iso", + "2021-01-01T00:00:00.000000+0000", + "%Y-%m-%dT%H:%M:%S.%f%z", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ("test_parse_date_number", "20210101", "%Y%m%d", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)), + ( + "test_parse_date_datetime", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + "%Y%m%d", + datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), + ), + ], +) +def test_parse_date(test_name, input_date, date_format, expected_output_date): + slicer = DatetimeStreamSlicer( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", options={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", options={}), + step="1d", + cursor_field=InterpolatedString(cursor_field, options={}), + datetime_format=date_format, + lookback_window=InterpolatedString("0d", options={}), + config=config, + options={}, + ) + output_date = slicer.parse_date(input_date) + assert expected_output_date == output_date + + +@pytest.mark.parametrize( + "test_name, input_dt, datetimeformat, expected_output", + [ + ("test_format_timestamp", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%s", "1609459200"), + ("test_format_string", datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc), "%Y-%m-%d", "2021-01-01"), + ], +) +def test_format_datetime(test_name, input_dt, datetimeformat, expected_output): + slicer = DatetimeStreamSlicer( + start_datetime=MinMaxDatetime("2021-01-01T00:00:00.000000+0000", options={}), + end_datetime=MinMaxDatetime("2021-01-10T00:00:00.000000+0000", options={}), + step="1d", + cursor_field=InterpolatedString(cursor_field, options={}), + datetime_format=datetimeformat, + lookback_window=InterpolatedString("0d", options={}), + config=config, + options={}, + ) + + output_date = slicer._format_datetime(input_dt) + assert expected_output == output_date + + +if __name__ == "__main__": + unittest.main() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_list_stream_slicer.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_list_stream_slicer.py new file mode 100644 index 0000000000000..1245a7c14ba00 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_list_stream_slicer.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +import pytest as pytest +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer import ListStreamSlicer + +slice_values = ["customer", "store", "subscription"] +cursor_field = "owner_resource" +options = {"cursor_field": "owner_resource"} + + +@pytest.mark.parametrize( + "test_name, slice_values, cursor_field, expected_slices", + [ + ( + "test_single_element", + ["customer", "store", "subscription"], + "owner_resource", + [{"owner_resource": "customer"}, {"owner_resource": "store"}, {"owner_resource": "subscription"}], + ), + ( + "test_input_list_is_string", + '["customer", "store", "subscription"]', + "owner_resource", + [{"owner_resource": "customer"}, {"owner_resource": "store"}, {"owner_resource": "subscription"}], + ), + ( + "test_using_cursor_from_options", + '["customer", "store", "subscription"]', + "{{ options['cursor_field'] }}", + [{"owner_resource": "customer"}, {"owner_resource": "store"}, {"owner_resource": "subscription"}], + ), + ], +) +def test_list_stream_slicer(test_name, slice_values, cursor_field, expected_slices): + slicer = ListStreamSlicer(slice_values=slice_values, cursor_field=cursor_field, config={}, options=options) + slices = [s for s in slicer.stream_slices(SyncMode.incremental, stream_state=None)] + assert slices == expected_slices + + +@pytest.mark.parametrize( + "test_name, stream_slice, last_record, expected_state", + [ + ("test_update_cursor_no_state_no_record", {}, None, {}), + ("test_update_cursor_with_state_no_record", {"owner_resource": "customer"}, None, {"owner_resource": "customer"}), + ("test_update_cursor_value_not_in_list", {"owner_resource": "invalid"}, None, {}), + ], +) +def test_update_cursor(test_name, stream_slice, last_record, expected_state): + slicer = ListStreamSlicer(slice_values=slice_values, cursor_field=cursor_field, config={}, options={}) + slicer.update_cursor(stream_slice, last_record) + updated_state = slicer.get_stream_state() + assert expected_state == updated_state + + +@pytest.mark.parametrize( + "test_name, request_option, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ( + "test_inject_into_req_param", + RequestOption(inject_into=RequestOptionType.request_parameter, options={}, field_name="owner_resource"), + {"owner_resource": "customer"}, + {}, + {}, + {}, + ), + ( + "test_pass_by_header", + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="owner_resource"), + {}, + {"owner_resource": "customer"}, + {}, + {}, + ), + ( + "test_inject_into_body_json", + RequestOption(inject_into=RequestOptionType.body_json, options={}, field_name="owner_resource"), + {}, + {}, + {"owner_resource": "customer"}, + {}, + ), + ( + "test_inject_into_body_data", + RequestOption(inject_into=RequestOptionType.body_data, options={}, field_name="owner_resource"), + {}, + {}, + {}, + {"owner_resource": "customer"}, + ), + ( + "test_inject_into_path", + RequestOption(RequestOptionType.path, {}), + {}, + {}, + {}, + {"owner_resource": "customer"}, + ), + ], +) +def test_request_option(test_name, request_option, expected_req_params, expected_headers, expected_body_json, expected_body_data): + if request_option.inject_into == RequestOptionType.path: + try: + ListStreamSlicer(slice_values=slice_values, cursor_field=cursor_field, config={}, request_option=request_option, options={}) + assert False + except ValueError: + return + 
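+ # Non-path request options: build the slicer, update its cursor, and check which request component receives the slice value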
slicer = ListStreamSlicer(slice_values=slice_values, cursor_field=cursor_field, config={}, request_option=request_option, options={}) + stream_slice = {cursor_field: "customer"} + + slicer.update_cursor(stream_slice) + assert expected_req_params == slicer.get_request_params(stream_slice) + assert expected_headers == slicer.get_request_headers() + assert expected_body_json == slicer.get_request_body_json() + assert expected_body_data == slicer.get_request_body_data() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_substream_slicer.py b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_substream_slicer.py new file mode 100644 index 0000000000000..d4c8d5ad1f1f4 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/stream_slicers/test_substream_slicer.py @@ -0,0 +1,264 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from typing import Any, Iterable, List, Mapping, Optional, Union + +import pytest as pytest +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.stream_slicers.substream_slicer import ParentStreamConfig, SubstreamSlicer +from airbyte_cdk.sources.streams.core import Stream + +parent_records = [{"id": 1, "data": "data1"}, {"id": 2, "data": "data2"}] +more_records = [{"id": 10, "data": "data10", "slice": "second_parent"}, {"id": 20, "data": "data20", "slice": "second_parent"}] + +data_first_parent_slice = [{"id": 0, "slice": "first", "data": "A"}, {"id": 1, "slice": "first", "data": "B"}] +data_second_parent_slice = [{"id": 2, "slice": "second", "data": "C"}] +data_third_parent_slice = [] +all_parent_data = data_first_parent_slice + data_second_parent_slice + data_third_parent_slice +parent_slices = [{"slice": "first"}, {"slice": "second"}, {"slice": "third"}] +second_parent_stream_slice = [{"slice": "second_parent"}] + + +class MockStream(Stream): + def __init__(self, slices, records, name): + self._slices = slices + self._records = records + self._name = name + + @property + def name(self) -> str: + return self._name + + @property + def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: + return "id" + + def stream_slices( + self, *, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None + ) -> Iterable[Optional[Mapping[str, Any]]]: + yield from self._slices + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + if not stream_slice: + yield from self._records + else: + yield from [r for r in self._records if r["slice"] == stream_slice["slice"]] + + +@pytest.mark.parametrize( + "test_name, parent_stream_configs, expected_slices", + [ + ("test_no_parents", [], None), + ( + "test_single_parent_slices_no_records", + [ + ParentStreamConfig( + stream=MockStream([{}], [], "first_stream"), parent_key="id", stream_slice_field="first_stream_id", options={} + ) + ], + [{"first_stream_id": None, "parent_slice": None}], + ), + ( + "test_single_parent_slices_with_records", + [ + ParentStreamConfig( + stream=MockStream([{}], parent_records, "first_stream"), + parent_key="id", + stream_slice_field="first_stream_id", + options={}, + ) + ], + [{"first_stream_id": 1, "parent_slice": None}, {"first_stream_id": 2, "parent_slice": None}], + ), + ( + 
"test_with_parent_slices_and_records", + [ + ParentStreamConfig( + stream=MockStream(parent_slices, all_parent_data, "first_stream"), + parent_key="id", + stream_slice_field="first_stream_id", + options={}, + ) + ], + [ + {"parent_slice": "first", "first_stream_id": 0}, + {"parent_slice": "first", "first_stream_id": 1}, + {"parent_slice": "second", "first_stream_id": 2}, + {"parent_slice": "third", "first_stream_id": None}, + ], + ), + ( + "test_multiple_parent_streams", + [ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + stream_slice_field="first_stream_id", + options={}, + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + stream_slice_field="second_stream_id", + options={}, + ), + ], + [ + {"parent_slice": "first", "first_stream_id": 0}, + {"parent_slice": "first", "first_stream_id": 1}, + {"parent_slice": "second", "first_stream_id": 2}, + {"parent_slice": "third", "first_stream_id": None}, + {"parent_slice": "second_parent", "second_stream_id": 10}, + {"parent_slice": "second_parent", "second_stream_id": 20}, + ], + ), + ], +) +def test_substream_slicer(test_name, parent_stream_configs, expected_slices): + if expected_slices is None: + try: + SubstreamSlicer(parent_stream_configs=parent_stream_configs, options={}) + assert False + except ValueError: + return + slicer = SubstreamSlicer(parent_stream_configs=parent_stream_configs, options={}) + slices = [s for s in slicer.stream_slices(SyncMode.incremental, stream_state=None)] + assert slices == expected_slices + + +@pytest.mark.parametrize( + "test_name, stream_slice, expected_state", + [ + ("test_update_cursor_no_state_no_record", {}, {}), + ("test_update_cursor_with_state_single_parent", {"first_stream_id": "1234"}, {"first_stream_id": "1234"}), + ("test_update_cursor_with_unknown_state_field", {"unknown_stream_id": "1234"}, {}), + ( + "test_update_cursor_with_state_from_both_parents", + {"first_stream_id": "1234", "second_stream_id": "4567"}, + {"first_stream_id": "1234", "second_stream_id": "4567"}, + ), + ], +) +def test_update_cursor(test_name, stream_slice, expected_state): + parent_stream_name_to_config = [ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + stream_slice_field="first_stream_id", + options={}, + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + stream_slice_field="second_stream_id", + options={}, + ), + ] + + slicer = SubstreamSlicer(parent_stream_configs=parent_stream_name_to_config, options={}) + slicer.update_cursor(stream_slice, None) + updated_state = slicer.get_stream_state() + assert expected_state == updated_state + + +@pytest.mark.parametrize( + "test_name, parent_stream_request_options, expected_req_params, expected_headers, expected_body_json, expected_body_data", + [ + ( + "test_request_option_in_request_param", + [ + RequestOption(inject_into=RequestOptionType.request_parameter, options={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.request_parameter, options={}, field_name="second_stream"), + ], + {"first_stream_id": "1234", "second_stream_id": "4567"}, + {}, + {}, + {}, + ), + ( + "test_request_option_in_header", + [ + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="first_stream"), + 
RequestOption(inject_into=RequestOptionType.header, options={}, field_name="second_stream"), + ], + {}, + {"first_stream_id": "1234", "second_stream_id": "4567"}, + {}, + {}, + ), + ( + "test_request_option_in_param_and_header", + [ + RequestOption(inject_into=RequestOptionType.request_parameter, options={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.header, options={}, field_name="second_stream"), + ], + {"first_stream_id": "1234"}, + {"second_stream_id": "4567"}, + {}, + {}, + ), + ( + "test_request_option_in_body_json", + [ + RequestOption(inject_into=RequestOptionType.body_json, options={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.body_json, options={}, field_name="second_stream"), + ], + {}, + {}, + {"first_stream_id": "1234", "second_stream_id": "4567"}, + {}, + ), + ( + "test_request_option_in_body_data", + [ + RequestOption(inject_into=RequestOptionType.body_data, options={}, field_name="first_stream"), + RequestOption(inject_into=RequestOptionType.body_data, options={}, field_name="second_stream"), + ], + {}, + {}, + {}, + {"first_stream_id": "1234", "second_stream_id": "4567"}, + ), + ], +) +def test_request_option( + test_name, + parent_stream_request_options, + expected_req_params, + expected_headers, + expected_body_json, + expected_body_data, +): + slicer = SubstreamSlicer( + parent_stream_configs=[ + ParentStreamConfig( + stream=MockStream(parent_slices, data_first_parent_slice + data_second_parent_slice, "first_stream"), + parent_key="id", + stream_slice_field="first_stream_id", + options={}, + request_option=parent_stream_request_options[0], + ), + ParentStreamConfig( + stream=MockStream(second_parent_stream_slice, more_records, "second_stream"), + parent_key="id", + stream_slice_field="second_stream_id", + options={}, + request_option=parent_stream_request_options[1], + ), + ], + options={}, + ) + slicer.update_cursor({"first_stream_id": "1234", "second_stream_id": "4567"}, None) + + assert expected_req_params == slicer.get_request_params() + assert expected_headers == slicer.get_request_headers() + assert expected_body_json == slicer.get_request_body_json() + assert expected_body_data == slicer.get_request_body_data() diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_create_partial.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_create_partial.py new file mode 100644 index 0000000000000..3ba79ab81e7dd --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_create_partial.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from airbyte_cdk.sources.declarative.create_partial import create +from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString + + +class AClass: + def __init__(self, parameter, another_param, options): + self.parameter = parameter + self.another_param = another_param + self.options = options + + +class OuterClass: + def __init__(self, name, some_field, inner_param): + self.name = name + self.some_field = some_field + self.inner_param = inner_param + + +class OuterOuterClass: + def __init__(self, name, param, inner_class): + self.name = name + self.param = param + self.inner_class = inner_class + + +def test_pass_parameter_to_create_function(): + object = create(AClass, parameter="A")(another_param="B") + assert object.parameter == "A" + assert object.another_param == "B" + + +def test_overwrite_param(): + object = create(AClass, parameter="A", another_param="B")(parameter="C") + assert object.parameter == "C" + assert object.another_param == "B" + + +def test_string_interpolation(): + s = "{{ next_page_token['next_page_url'] }}" + partial = create(InterpolatedString, string=s) + interpolated_string = partial() + assert interpolated_string.string == s + + +def test_string_interpolation_through_kwargs(): + s = "{{ options['name'] }}" + options = {"name": "airbyte"} + partial = create(InterpolatedString, string=s, **options) + interpolated_string = partial() + assert interpolated_string.eval({}) == "airbyte" + + +def test_string_interpolation_through_options_keyword(): + s = "{{ options['name'] }}" + options = {"$options": {"name": "airbyte"}} + partial = create(InterpolatedString, string=s, **options) + interpolated_string = partial() + assert interpolated_string.eval({}) == "airbyte" diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py new file mode 100644 index 0000000000000..1b6b849062713 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_declarative_stream.py @@ -0,0 +1,62 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +from unittest import mock +from unittest.mock import MagicMock, call + +from airbyte_cdk.models import SyncMode +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.transformations import RecordTransformation + + +def test_declarative_stream(): + name = "stream" + primary_key = "pk" + cursor_field = ["created_at"] + + schema_loader = MagicMock() + json_schema = {"name": {"type": "string"}} + schema_loader.get_json_schema.return_value = json_schema + + state = MagicMock() + records = [{"pk": 1234, "field": "value"}, {"pk": 4567, "field": "different_value"}] + stream_slices = [{"date": "2021-01-01"}, {"date": "2021-01-02"}, {"date": "2021-01-03"}] + checkpoint_interval = 1000 + + retriever = MagicMock() + retriever.state = state + retriever.read_records.return_value = records + retriever.stream_slices.return_value = stream_slices + + no_op_transform = mock.create_autospec(spec=RecordTransformation) + no_op_transform.transform = MagicMock(side_effect=lambda record, config, stream_slice, stream_state: record) + transformations = [no_op_transform] + + config = {"api_key": "open_sesame"} + + stream = DeclarativeStream( + name=name, + primary_key=primary_key, + stream_cursor_field=cursor_field, + schema_loader=schema_loader, + retriever=retriever, + config=config, + transformations=transformations, + checkpoint_interval=checkpoint_interval, + options={}, + ) + + assert stream.name == name + assert stream.get_json_schema() == json_schema + assert stream.state == state + input_slice = stream_slices[0] + assert list(stream.read_records(SyncMode.full_refresh, cursor_field, input_slice, state)) == records + assert stream.primary_key == primary_key + assert stream.cursor_field == cursor_field + assert stream.stream_slices(sync_mode=SyncMode.incremental, cursor_field=cursor_field, stream_state=None) == stream_slices + assert stream.state_checkpoint_interval == checkpoint_interval + for transformation in transformations: + assert len(transformation.transform.call_args_list) == len(records) + expected_calls = [call(record, config=config, stream_slice=input_slice, stream_state=state) for record in records] + transformation.transform.assert_has_calls(expected_calls, any_order=False) diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/test_factory.py b/airbyte-cdk/python/unit_tests/sources/declarative/test_factory.py new file mode 100644 index 0000000000000..190c448460475 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/test_factory.py @@ -0,0 +1,582 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + +import datetime + +from airbyte_cdk.sources.declarative.auth.token import BasicHttpAuthenticator +from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDatetime +from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream +from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder +from airbyte_cdk.sources.declarative.extractors.jello import JelloExtractor +from airbyte_cdk.sources.declarative.extractors.record_filter import RecordFilter +from airbyte_cdk.sources.declarative.extractors.record_selector import RecordSelector +from airbyte_cdk.sources.declarative.interpolation import InterpolatedString +from airbyte_cdk.sources.declarative.parsers.factory import DeclarativeComponentFactory +from airbyte_cdk.sources.declarative.parsers.yaml_parser import YamlParser +from airbyte_cdk.sources.declarative.requesters.error_handlers.composite_error_handler import CompositeErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.default_error_handler import DefaultErrorHandler +from airbyte_cdk.sources.declarative.requesters.error_handlers.http_response_filter import HttpResponseFilter +from airbyte_cdk.sources.declarative.requesters.http_requester import HttpRequester +from airbyte_cdk.sources.declarative.requesters.paginators.limit_paginator import LimitPaginator +from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType +from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider import ( + InterpolatedRequestOptionsProvider, +) +from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod +from airbyte_cdk.sources.declarative.retrievers.simple_retriever import SimpleRetriever +from airbyte_cdk.sources.declarative.schema.json_schema import JsonSchema +from airbyte_cdk.sources.declarative.stream_slicers.datetime_stream_slicer import DatetimeStreamSlicer +from airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer import ListStreamSlicer +from airbyte_cdk.sources.declarative.transformations import AddFields, RemoveFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition + +factory = DeclarativeComponentFactory() + +parser = YamlParser() + +input_config = {"apikey": "verysecrettoken", "repos": ["airbyte", "airbyte-cloud"]} + + +def test_factory(): + content = """ + limit: 50 + offset_request_parameters: + offset: "{{ next_page_token['offset'] }}" + limit: "*ref(limit)" + request_options: + $options: + here: "iam" + class_name: airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider.InterpolatedRequestOptionsProvider + request_parameters: "*ref(offset_request_parameters)" + request_body_json: + body_offset: "{{ next_page_token['offset'] }}" + """ + config = parser.parse(content) + request_options_provider = factory.create_component(config["request_options"], input_config)() + + assert type(request_options_provider) == InterpolatedRequestOptionsProvider + assert request_options_provider._parameter_interpolator._config == input_config + assert request_options_provider._parameter_interpolator._interpolator.mapping["offset"] == "{{ next_page_token['offset'] }}" + assert request_options_provider._body_json_interpolator._config == input_config + assert request_options_provider._body_json_interpolator._interpolator.mapping["body_offset"] == "{{ next_page_token['offset'] }}" + + +def test_interpolate_config(): + content = """ + 
authenticator: + class_name: airbyte_cdk.sources.declarative.auth.oauth.DeclarativeOauth2Authenticator + client_id: "some_client_id" + client_secret: "some_client_secret" + token_refresh_endpoint: "https://api.sendgrid.com/v3/auth" + refresh_token: "{{ config['apikey'] }}" + refresh_request_body: + body_field: "yoyoyo" + interpolated_body_field: "{{ config['apikey'] }}" + """ + config = parser.parse(content) + authenticator = factory.create_component(config["authenticator"], input_config)() + assert authenticator.client_id.eval(input_config) == "some_client_id" + assert authenticator.client_secret.string == "some_client_secret" + + assert authenticator.token_refresh_endpoint.eval(input_config) == "https://api.sendgrid.com/v3/auth" + assert authenticator.refresh_token.eval(input_config) == "verysecrettoken" + assert authenticator._refresh_request_body.mapping == {"body_field": "yoyoyo", "interpolated_body_field": "{{ config['apikey'] }}"} + assert authenticator.get_refresh_request_body() == {"body_field": "yoyoyo", "interpolated_body_field": "verysecrettoken"} + + +def test_list_based_stream_slicer_with_values_refd(): + content = """ + repositories: ["airbyte", "airbyte-cloud"] + stream_slicer: + class_name: airbyte_cdk.sources.declarative.stream_slicers.list_stream_slicer.ListStreamSlicer + slice_values: "*ref(repositories)" + cursor_field: repository + """ + config = parser.parse(content) + stream_slicer = factory.create_component(config["stream_slicer"], input_config)() + assert ["airbyte", "airbyte-cloud"] == stream_slicer.slice_values + + +def test_list_based_stream_slicer_with_values_defined_in_config(): + content = """ + stream_slicer: + type: ListStreamSlicer + slice_values: "{{config['repos']}}" + cursor_field: repository + request_option: + inject_into: header + field_name: repository + """ + config = parser.parse(content) + stream_slicer = factory.create_component(config["stream_slicer"], input_config)() + assert ["airbyte", "airbyte-cloud"] == stream_slicer.slice_values + assert stream_slicer.request_option.inject_into == RequestOptionType.header + assert stream_slicer.request_option.field_name == "repository" + + +def test_create_substream_slicer(): + content = """ + schema_loader: + file_path: "./source_sendgrid/schemas/{{ options['stream_name'] }}.yaml" + name: "{{ options['stream_name'] }}" + retriever: + requester: + name: "{{ options['stream_name'] }}" + path: "/v3" + record_selector: + extractor: + transform: "_" + stream_A: + type: DeclarativeStream + $options: + stream_name: "A" + stream_primary_key: "id" + retriever: "*ref(retriever)" + url_base: "https://airbyte.io" + schema_loader: "*ref(schema_loader)" + stream_B: + type: DeclarativeStream + $options: + stream_name: "B" + stream_primary_key: "id" + retriever: "*ref(retriever)" + url_base: "https://airbyte.io" + schema_loader: "*ref(schema_loader)" + stream_slicer: + type: SubstreamSlicer + parent_stream_configs: + - stream: "*ref(stream_A)" + parent_key: id + stream_slice_field: repository_id + request_option: + inject_into: request_parameter + field_name: repository_id + - stream: "*ref(stream_B)" + parent_key: someid + stream_slice_field: word_id + """ + config = parser.parse(content) + stream_slicer = factory.create_component(config["stream_slicer"], input_config)() + parent_stream_configs = stream_slicer.parent_stream_configs + assert len(parent_stream_configs) == 2 + assert isinstance(parent_stream_configs[0].stream, DeclarativeStream) + assert isinstance(parent_stream_configs[1].stream, DeclarativeStream) + 
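+ # Only the first parent stream config declares a request_option above, so the second one is expected to resolve to None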
assert stream_slicer.parent_stream_configs[0].parent_key == "id" + assert stream_slicer.parent_stream_configs[0].stream_slice_field == "repository_id" + assert stream_slicer.parent_stream_configs[0].request_option.inject_into == RequestOptionType.request_parameter + assert stream_slicer.parent_stream_configs[0].request_option.field_name == "repository_id" + + assert stream_slicer.parent_stream_configs[1].parent_key == "someid" + assert stream_slicer.parent_stream_configs[1].stream_slice_field == "word_id" + assert stream_slicer.parent_stream_configs[1].request_option is None + + +def test_create_cartesian_stream_slicer(): + content = """ + stream_slicer_A: + type: ListStreamSlicer + slice_values: "{{config['repos']}}" + cursor_field: repository + stream_slicer_B: + type: ListStreamSlicer + slice_values: + - hello + - world + cursor_field: words + stream_slicer: + type: CartesianProductStreamSlicer + stream_slicers: + - "*ref(stream_slicer_A)" + - "*ref(stream_slicer_B)" + """ + config = parser.parse(content) + stream_slicer = factory.create_component(config["stream_slicer"], input_config)() + underlying_slicers = stream_slicer.stream_slicers + assert len(underlying_slicers) == 2 + assert isinstance(underlying_slicers[0], ListStreamSlicer) + assert isinstance(underlying_slicers[1], ListStreamSlicer) + assert ["airbyte", "airbyte-cloud"] == underlying_slicers[0].slice_values + assert ["hello", "world"] == underlying_slicers[1].slice_values + + +def test_datetime_stream_slicer(): + content = """ + stream_slicer: + type: DatetimeStreamSlicer + $options: + datetime_format: "%Y-%m-%dT%H:%M:%S.%f%z" + start_datetime: + type: MinMaxDatetime + datetime: "{{ config['start_time'] }}" + min_datetime: "{{ config['start_time'] + day_delta(2) }}" + end_datetime: "{{ config['end_time'] }}" + step: "10d" + cursor_field: "created" + lookback_window: "5d" + start_time_option: + inject_into: request_parameter + field_name: created[gte] + """ + + config = parser.parse(content) + stream_slicer = factory.create_component(config["stream_slicer"], input_config)() + assert type(stream_slicer) == DatetimeStreamSlicer + assert stream_slicer._timezone == datetime.timezone.utc + assert type(stream_slicer.start_datetime) == MinMaxDatetime + assert type(stream_slicer.end_datetime) == MinMaxDatetime + assert stream_slicer.start_datetime._datetime_format == "%Y-%m-%dT%H:%M:%S.%f%z" + assert stream_slicer.start_datetime._timezone == datetime.timezone.utc + assert stream_slicer.start_datetime.datetime.string == "{{ config['start_time'] }}" + assert stream_slicer.start_datetime.min_datetime.string == "{{ config['start_time'] + day_delta(2) }}" + assert stream_slicer.end_datetime.datetime.string == "{{ config['end_time'] }}" + assert stream_slicer._step == datetime.timedelta(days=10) + assert stream_slicer.cursor_field.string == "created" + assert stream_slicer.lookback_window.string == "5d" + assert stream_slicer.start_time_option.inject_into == RequestOptionType.request_parameter + assert stream_slicer.start_time_option.field_name == "created[gte]" + + +def test_full_config(): + content = """ +decoder: + class_name: "airbyte_cdk.sources.declarative.decoders.json_decoder.JsonDecoder" +extractor: + class_name: airbyte_cdk.sources.declarative.extractors.jello.JelloExtractor + decoder: "*ref(decoder)" +selector: + class_name: airbyte_cdk.sources.declarative.extractors.record_selector.RecordSelector + record_filter: + class_name: airbyte_cdk.sources.declarative.extractors.record_filter.RecordFilter + condition: "{{ record['id'] > 
stream_state['id'] }}" +metadata_paginator: + type: "LimitPaginator" + page_size: 10 + limit_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + inject_into: path + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + url_base: "https://api.sendgrid.com/v3/" +next_page_url_from_token_partial: + class_name: "airbyte_cdk.sources.declarative.interpolation.interpolated_string.InterpolatedString" + string: "{{ next_page_token['next_page_url'] }}" +request_options_provider: + class_name: airbyte_cdk.sources.declarative.requesters.request_options.interpolated_request_options_provider.InterpolatedRequestOptionsProvider +requester: + class_name: airbyte_cdk.sources.declarative.requesters.http_requester.HttpRequester + name: "{{ options['name'] }}" + url_base: "https://api.sendgrid.com/v3/" + http_method: "GET" + authenticator: + type: BearerAuthenticator + api_token: "{{ config['apikey'] }}" + request_parameters_provider: "*ref(request_options_provider)" + error_handler: + type: DefaultErrorHandler +retriever: + class_name: "airbyte_cdk.sources.declarative.retrievers.simple_retriever.SimpleRetriever" + name: "{{ options['name'] }}" + stream_slicer: + class_name: airbyte_cdk.sources.declarative.stream_slicers.single_slice.SingleSlice + paginator: + class_name: airbyte_cdk.sources.declarative.requesters.paginators.no_pagination.NoPagination + primary_key: "{{ options['primary_key'] }}" +partial_stream: + class_name: "airbyte_cdk.sources.declarative.declarative_stream.DeclarativeStream" + schema_loader: + class_name: airbyte_cdk.sources.declarative.schema.json_schema.JsonSchema + file_path: "./source_sendgrid/schemas/{{ options.name }}.json" + cursor_field: [ ] +list_stream: + $ref: "*ref(partial_stream)" + $options: + name: "lists" + primary_key: "id" + extractor: + $ref: "*ref(extractor)" + transform: "_.result" + retriever: + $ref: "*ref(retriever)" + requester: + $ref: "*ref(requester)" + path: + $ref: "*ref(next_page_url_from_token_partial)" + default: "marketing/lists" + paginator: + $ref: "*ref(metadata_paginator)" + record_selector: + $ref: "*ref(selector)" +check: + class_name: airbyte_cdk.sources.declarative.checks.check_stream.CheckStream + stream_names: ["list_stream"] + """ + config = parser.parse(content) + + stream_config = config["list_stream"] + assert stream_config["class_name"] == "airbyte_cdk.sources.declarative.declarative_stream.DeclarativeStream" + assert stream_config["cursor_field"] == [] + stream = factory.create_component(stream_config, input_config)() + + assert isinstance(stream.retriever.record_selector.extractor, JelloExtractor) + + assert type(stream) == DeclarativeStream + assert stream.primary_key == "id" + assert stream.name == "lists" + assert type(stream.schema_loader) == JsonSchema + assert type(stream.retriever) == SimpleRetriever + assert stream.retriever.requester.http_method == HttpMethod.GET + assert stream.retriever.requester.authenticator._token.eval(input_config) == "verysecrettoken" + assert type(stream.retriever.record_selector) == RecordSelector + assert type(stream.retriever.record_selector.extractor.decoder) == JsonDecoder + + assert stream.retriever.record_selector.extractor.transform.eval(input_config) == "_.result" + assert type(stream.retriever.record_selector.record_filter) == RecordFilter + assert stream.retriever.record_selector.record_filter._filter_interpolator.condition == "{{ record['id'] > stream_state['id'] }}" + assert 
stream.schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.json" + + checker = factory.create_component(config["check"], input_config)() + streams_to_check = checker.stream_names + assert len(streams_to_check) == 1 + assert list(streams_to_check)[0] == "list_stream" + + assert stream.retriever.requester.path.default == "marketing/lists" + + +def test_create_record_selector(): + content = """ + extractor: + type: JelloExtractor + transform: "_.result" + selector: + class_name: airbyte_cdk.sources.declarative.extractors.record_selector.RecordSelector + record_filter: + class_name: airbyte_cdk.sources.declarative.extractors.record_filter.RecordFilter + condition: "{{ record['id'] > stream_state['id'] }}" + extractor: + $ref: "*ref(extractor)" + transform: "_.result" + """ + config = parser.parse(content) + selector = factory.create_component(config["selector"], input_config)() + assert isinstance(selector, RecordSelector) + assert isinstance(selector.extractor, JelloExtractor) + assert selector.extractor.transform.eval(input_config) == "_.result" + assert isinstance(selector.record_filter, RecordFilter) + + +def test_create_requester(): + content = """ + requester: + type: HttpRequester + path: "/v3/marketing/lists" + $options: + name: 'lists' + url_base: "https://api.sendgrid.com" + authenticator: + type: "BasicHttpAuthenticator" + username: "{{ options.name }}" + password: "{{ config.apikey }}" + request_options_provider: + request_parameters: + page_size: 10 + request_headers: + header: header_value + """ + config = parser.parse(content) + component = factory.create_component(config["requester"], input_config)() + assert isinstance(component, HttpRequester) + assert isinstance(component.error_handler, DefaultErrorHandler) + assert component.path.string == "/v3/marketing/lists" + assert component.url_base.string == "https://api.sendgrid.com" + assert isinstance(component.authenticator, BasicHttpAuthenticator) + assert component.authenticator._username.eval(input_config) == "lists" + assert component.authenticator._password.eval(input_config) == "verysecrettoken" + assert component._method == HttpMethod.GET + assert component._request_options_provider._parameter_interpolator._interpolator.mapping["page_size"] == 10 + assert component._request_options_provider._headers_interpolator._interpolator.mapping["header"] == "header_value" + assert component.name == "lists" + + +def test_create_composite_error_handler(): + content = """ + error_handler: + type: "CompositeErrorHandler" + error_handlers: + - response_filters: + - predicate: "{{ 'code' in response }}" + action: RETRY + - response_filters: + - http_codes: [ 403 ] + action: RETRY + """ + config = parser.parse(content) + component = factory.create_component(config["error_handler"], input_config)() + assert len(component.error_handlers) == 2 + assert isinstance(component.error_handlers[0], DefaultErrorHandler) + assert isinstance(component.error_handlers[0].response_filters[0], HttpResponseFilter) + assert component.error_handlers[0].response_filters[0].predicate.condition == "{{ 'code' in response }}" + assert component.error_handlers[1].response_filters[0].http_codes == [403] + assert isinstance(component, CompositeErrorHandler) + + +def test_config_with_defaults(): + content = """ + lists_stream: + type: "DeclarativeStream" + $options: + name: "lists" + primary_key: id + url_base: "https://api.sendgrid.com" + schema_loader: + name: "{{ options.stream_name }}" + file_path: "./source_sendgrid/schemas/{{ options.name 
}}.yaml" + retriever: + paginator: + type: "LimitPaginator" + page_size: 10 + limit_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + inject_into: path + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + requester: + path: "/v3/marketing/lists" + authenticator: + type: "BearerAuthenticator" + api_token: "{{ config.apikey }}" + request_parameters: + page_size: 10 + record_selector: + extractor: + transform: "_.result" + streams: + - "*ref(lists_stream)" + """ + config = parser.parse(content) + + stream_config = config["lists_stream"] + stream = factory.create_component(stream_config, input_config)() + assert type(stream) == DeclarativeStream + assert stream.primary_key == "id" + assert stream.name == "lists" + assert type(stream.schema_loader) == JsonSchema + assert type(stream.retriever) == SimpleRetriever + assert stream.retriever.requester.http_method == HttpMethod.GET + + assert stream.retriever.requester.authenticator._token.eval(input_config) == "verysecrettoken" + assert stream.retriever.record_selector.extractor.transform.eval(input_config) == "_.result" + assert stream.schema_loader._get_json_filepath() == "./source_sendgrid/schemas/lists.yaml" + assert isinstance(stream.retriever.paginator, LimitPaginator) + + assert stream.retriever.paginator.url_base.string == "https://api.sendgrid.com" + assert stream.retriever.paginator.page_size == 10 + + +def test_create_limit_paginator(): + content = """ + paginator: + type: "LimitPaginator" + page_size: 10 + url_base: "https://airbyte.io" + limit_option: + inject_into: request_parameter + field_name: page_size + page_token_option: + inject_into: path + pagination_strategy: + type: "CursorPagination" + cursor_value: "{{ response._metadata.next }}" + """ + config = parser.parse(content) + + paginator_config = config["paginator"] + paginator = factory.create_component(paginator_config, input_config)() + assert isinstance(paginator, LimitPaginator) + page_token_option = paginator.page_token_option + assert isinstance(page_token_option, RequestOption) + assert page_token_option.inject_into == RequestOptionType.path + + +class TestCreateTransformations: + # the tabbing matters + base_options = """ + name: "lists" + primary_key: id + url_base: "https://api.sendgrid.com" + schema_loader: + name: "{{ options.name }}" + file_path: "./source_sendgrid/schemas/{{ options.name }}.yaml" + retriever: + requester: + name: "{{ options.name }}" + path: "/v3/marketing/lists" + request_parameters: + page_size: 10 + record_selector: + extractor: + transform: "_.result" + """ + + def test_no_transformations(self): + content = f""" + the_stream: + type: DeclarativeStream + $options: + {self.base_options} + """ + config = parser.parse(content) + component = factory.create_component(config["the_stream"], input_config)() + assert isinstance(component, DeclarativeStream) + assert [] == component.transformations + + def test_remove_fields(self): + content = f""" + the_stream: + type: DeclarativeStream + $options: + {self.base_options} + transformations: + - type: RemoveFields + field_pointers: + - ["path", "to", "field1"] + - ["path2"] + """ + config = parser.parse(content) + component = factory.create_component(config["the_stream"], input_config)() + assert isinstance(component, DeclarativeStream) + expected = [RemoveFields(field_pointers=[["path", "to", "field1"], ["path2"]], options={})] + assert expected == component.transformations + + def test_add_fields(self): + content = f""" 
+ the_stream: + class_name: airbyte_cdk.sources.declarative.declarative_stream.DeclarativeStream + $options: + {self.base_options} + transformations: + - type: AddFields + fields: + - path: ["field1"] + value: "static_value" + """ + config = parser.parse(content) + component = factory.create_component(config["the_stream"], input_config)() + assert isinstance(component, DeclarativeStream) + expected = [ + AddFields( + fields=[ + AddedFieldDefinition( + path=["field1"], value=InterpolatedString(string="static_value", default="static_value", options={}), options={} + ) + ], + options={}, + ) + ] + assert expected == component.transformations diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py new file mode 100644 index 0000000000000..61fb31ba70562 --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_add_fields.py @@ -0,0 +1,110 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from typing import Any, List, Mapping, Tuple + +import pytest +from airbyte_cdk.sources.declarative.transformations import AddFields +from airbyte_cdk.sources.declarative.transformations.add_fields import AddedFieldDefinition +from airbyte_cdk.sources.declarative.types import FieldPointer + + +@pytest.mark.parametrize( + ["input_record", "field", "kwargs", "expected"], + [ + pytest.param({"k": "v"}, [(["path"], "static_value")], {}, {"k": "v", "path": "static_value"}, id="add new static value"), + pytest.param( + {"k": "v"}, + [(["path"], "static_value"), (["path2"], "static_value2")], + {}, + {"k": "v", "path": "static_value", "path2": "static_value2"}, + id="add new multiple static values", + ), + pytest.param( + {"k": "v"}, + [(["nested", "path"], "static_value")], + {}, + {"k": "v", "nested": {"path": "static_value"}}, + id="set static value at nested path", + ), + pytest.param({"k": "v"}, [(["k"], "new_value")], {}, {"k": "new_value"}, id="update value which already exists"), + pytest.param({"k": [0, 1]}, [(["k", 3], "v")], {}, {"k": [0, 1, None, "v"]}, id="Set element inside array"), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ config["shop"] }}')], + {"config": {"shop": "in-n-out"}}, + {"k": "v", "k2": "in-n-out"}, + id="set a value from the config using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ config.shop }}")], + {"config": {"shop": "in-n-out"}}, + {"k": "v", "k2": "in-n-out"}, + id="set a value from the config using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ stream_state["cursor"] }}')], + {"stream_state": {"cursor": "t0"}}, + {"k": "v", "k2": "t0"}, + id="set a value from the state using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ stream_state.cursor }}")], + {"stream_state": {"cursor": "t0"}}, + {"k": "v", "k2": "t0"}, + id="set a value from the state using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ stream_slice["start_date"] }}')], + {"stream_slice": {"start_date": "oct1"}}, + {"k": "v", "k2": "oct1"}, + id="set a value from the stream slice using bracket notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ stream_slice.start_date }}")], + {"stream_slice": {"start_date": "oct1"}}, + {"k": "v", "k2": "oct1"}, + id="set a value from the stream slice using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], "{{ record.k }}")], + {}, + {"k": "v", "k2": "v"}, + id="set a value from a field in the record 
using dot notation", + ), + pytest.param( + {"k": "v"}, + [(["k2"], '{{ record["k"] }}')], + {}, + {"k": "v", "k2": "v"}, + id="set a value from a field in the record using bracket notation", + ), + pytest.param( + {"k": {"nested": "v"}}, + [(["k2"], "{{ record.k.nested }}")], + {}, + {"k": {"nested": "v"}, "k2": "v"}, + id="set a value from a nested field in the record using bracket notation", + ), + pytest.param( + {"k": {"nested": "v"}}, + [(["k2"], '{{ record["k"]["nested"] }}')], + {}, + {"k": {"nested": "v"}, "k2": "v"}, + id="set a value from a nested field in the record using bracket notation", + ), + pytest.param({"k": "v"}, [(["k2"], "{{ 2 + 2 }}")], {}, {"k": "v", "k2": 4}, id="set a value from a jinja expression"), + ], +) +def test_add_fields( + input_record: Mapping[str, Any], field: List[Tuple[FieldPointer, str]], kwargs: Mapping[str, Any], expected: Mapping[str, Any] +): + inputs = [AddedFieldDefinition(path=v[0], value=v[1], options={}) for v in field] + assert AddFields(fields=inputs, options={"alas": "i live"}).transform(input_record, **kwargs) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py new file mode 100644 index 0000000000000..c1d0358e4cdba --- /dev/null +++ b/airbyte-cdk/python/unit_tests/sources/declarative/transformations/test_remove_fields.py @@ -0,0 +1,48 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +from typing import Any, List, Mapping + +import pytest +from airbyte_cdk.sources.declarative.transformations import RemoveFields +from airbyte_cdk.sources.declarative.types import FieldPointer + + +@pytest.mark.parametrize( + ["input_record", "field_pointers", "expected"], + [ + pytest.param({"k1": "v", "k2": "v"}, [["k1"]], {"k2": "v"}, id="remove a field that exists (flat dict)"), + pytest.param({"k1": "v", "k2": "v"}, [["k3"]], {"k1": "v", "k2": "v"}, id="remove a field that doesn't exist (flat dict)"), + pytest.param({"k1": "v", "k2": "v"}, [["k1"], ["k2"]], {}, id="remove multiple fields that exist (flat dict)"), + # TODO: should we instead splice the element out of the array? I think that's the more intuitive solution + # Otherwise one could just set the field's value to null. 
+ pytest.param({"k1": [1, 2]}, [["k1", 0]], {"k1": [None, 2]}, id="remove field inside array (int index)"), + pytest.param({"k1": [1, 2]}, [["k1", "0"]], {"k1": [None, 2]}, id="remove field inside array (string index)"), + pytest.param( + {"k1": "v", "k2": "v", "k3": [0, 1], "k4": "v"}, + [["k1"], ["k2"], ["k3", 0]], + {"k3": [None, 1], "k4": "v"}, + id="test all cases (flat)", + ), + pytest.param({"k1": [0, 1]}, [[".", "k1", 10]], {"k1": [0, 1]}, id="remove array index that doesn't exist (flat)"), + pytest.param({".": {"k1": [0, 1]}}, [[".", "k1", 10]], {".": {"k1": [0, 1]}}, id="remove array index that doesn't exist (nested)"), + pytest.param({".": {"k2": "v", "k1": "v"}}, [[".", "k1"]], {".": {"k2": "v"}}, id="remove nested field that exists"), + pytest.param( + {".": {"k2": "v", "k1": "v"}}, [[".", "k3"]], {".": {"k2": "v", "k1": "v"}}, id="remove field that doesn't exist (nested)" + ), + pytest.param({".": {"k2": "v", "k1": "v"}}, [[".", "k1"], [".", "k2"]], {".": {}}, id="remove multiple fields that exist (nested)"), + pytest.param( + {".": {"k1": [0, 1]}}, [[".", "k1", 0]], {".": {"k1": [None, 1]}}, id="remove multiple fields that exist in arrays (nested)" + ), + pytest.param( + {".": {"k1": [{"k2": "v", "k3": "v"}, {"k4": "v"}]}}, + [[".", "k1", 0, "k2"], [".", "k1", 1, "k4"]], + {".": {"k1": [{"k3": "v"}, {}]}}, + id="remove fields that exist in arrays (deeply nested)", + ), + ], +) +def test_remove_fields(input_record: Mapping[str, Any], field_pointers: List[FieldPointer], expected: Mapping[str, Any]): + transformation = RemoveFields(field_pointers=field_pointers, options={}) + assert transformation.transform(input_record) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/auth/test_auth.py b/airbyte-cdk/python/unit_tests/sources/streams/http/auth/test_auth.py index f5b52b03ee626..3016113533eb0 100644 --- a/airbyte-cdk/python/unit_tests/sources/streams/http/auth/test_auth.py +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/auth/test_auth.py @@ -1,11 +1,17 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # import logging -from airbyte_cdk.sources.streams.http.auth import MultipleTokenAuthenticator, NoAuth, Oauth2Authenticator, TokenAuthenticator +from airbyte_cdk.sources.streams.http.auth import ( + BasicHttpAuthenticator, + MultipleTokenAuthenticator, + NoAuth, + Oauth2Authenticator, + TokenAuthenticator, +) LOGGER = logging.getLogger(__name__) @@ -41,6 +47,12 @@ def test_no_auth(): assert {} == no_auth.get_auth_header() +def test_basic_authenticator(): + token = BasicHttpAuthenticator("client_id", "client_secret") + header = token.get_auth_header() + assert {"Authorization": "Basic Y2xpZW50X2lkOmNsaWVudF9zZWNyZXQ="} == header + + class TestOauth2Authenticator: """ Test class for OAuth2Authenticator. 
@@ -51,6 +63,7 @@ class TestOauth2Authenticator: client_secret = "client_secret" refresh_token = "refresh_token" refresh_access_token_headers = {"Header_1": "value 1", "Header_2": "value 2"} + refresh_access_token_authenticator = BasicHttpAuthenticator(client_id, client_secret) def test_get_auth_header_fresh(self, mocker): """ @@ -129,3 +142,14 @@ def test_refresh_access_token(self, requests_mock): assert header in mock_refresh_token_call.last_request.headers assert self.refresh_access_token_headers[header] == mock_refresh_token_call.last_request.headers[header] assert mock_refresh_token_call.called + + def test_refresh_access_authenticator(self): + oauth = Oauth2Authenticator( + TestOauth2Authenticator.refresh_endpoint, + TestOauth2Authenticator.client_id, + TestOauth2Authenticator.client_secret, + TestOauth2Authenticator.refresh_token, + refresh_access_token_authenticator=TestOauth2Authenticator.refresh_access_token_authenticator, + ) + expected_headers = {"Authorization": "Basic Y2xpZW50X2lkOmNsaWVudF9zZWNyZXQ="} + assert expected_headers == oauth.get_refresh_access_token_headers() diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py b/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py index 58d85736f6c29..36386d2143d66 100644 --- a/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +++ b/airbyte-cdk/python/unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py @@ -1,16 +1,23 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # - import logging +import pendulum import requests -from airbyte_cdk.sources.streams.http.requests_native_auth import MultipleTokenAuthenticator, Oauth2Authenticator, TokenAuthenticator +from airbyte_cdk.sources.streams.http.requests_native_auth import ( + BasicHttpAuthenticator, + MultipleTokenAuthenticator, + Oauth2Authenticator, + TokenAuthenticator, +) from requests import Response LOGGER = logging.getLogger(__name__) +resp = Response() + def test_token_authenticator(): """ @@ -29,6 +36,23 @@ def test_token_authenticator(): assert {"Authorization": "Bearer test-token"} == header2 +def test_basic_http_authenticator(): + """ + Should match passed in token, no matter how many times token is retrieved. 
+ """ + token_auth = BasicHttpAuthenticator(username="user", password="password") + header1 = token_auth.get_auth_header() + header2 = token_auth.get_auth_header() + + prepared_request = requests.PreparedRequest() + prepared_request.headers = {} + token_auth(prepared_request) + + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == prepared_request.headers + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == header1 + assert {"Authorization": "Basic dXNlcjpwYXNzd29yZA=="} == header2 + + def test_multiple_token_authenticator(): multiple_token_auth = MultipleTokenAuthenticator(tokens=["token1", "token2"]) header1 = multiple_token_auth.get_auth_header() @@ -96,34 +120,40 @@ def test_refresh_request_body(self): """ scopes = ["scope1", "scope2"] oauth = Oauth2Authenticator( - token_refresh_endpoint=TestOauth2Authenticator.refresh_endpoint, - client_id=TestOauth2Authenticator.client_id, - client_secret=TestOauth2Authenticator.client_secret, - refresh_token=TestOauth2Authenticator.refresh_token, - scopes=scopes, + token_refresh_endpoint="refresh_end", + client_id="some_client_id", + client_secret="some_client_secret", + refresh_token="some_refresh_token", + scopes=["scope1", "scope2"], + token_expiry_date=pendulum.now().add(days=3), + refresh_request_body={"custom_field": "in_outbound_request", "another_field": "exists_in_body", "scopes": ["no_override"]}, ) - body = oauth.get_refresh_request_body() + body = oauth.build_refresh_request_body() expected = { "grant_type": "refresh_token", - "client_id": "client_id", - "client_secret": "client_secret", - "refresh_token": "refresh_token", + "client_id": "some_client_id", + "client_secret": "some_client_secret", + "refresh_token": "some_refresh_token", "scopes": scopes, + "custom_field": "in_outbound_request", + "another_field": "exists_in_body", } assert body == expected def test_refresh_access_token(self, mocker): oauth = Oauth2Authenticator( - token_refresh_endpoint=TestOauth2Authenticator.refresh_endpoint, - client_id=TestOauth2Authenticator.client_id, - client_secret=TestOauth2Authenticator.client_secret, - refresh_token=TestOauth2Authenticator.refresh_token, + token_refresh_endpoint="refresh_end", + client_id="some_client_id", + client_secret="some_client_secret", + refresh_token="some_refresh_token", + scopes=["scope1", "scope2"], + token_expiry_date=pendulum.now().add(days=3), + refresh_request_body={"custom_field": "in_outbound_request", "another_field": "exists_in_body", "scopes": ["no_override"]}, ) - resp = Response() - resp.status_code = 200 - mocker.patch.object(requests, "request", return_value=resp) + resp.status_code = 200 mocker.patch.object(resp, "json", return_value={"access_token": "access_token", "expires_in": 1000}) + mocker.patch.object(requests, "request", side_effect=mock_request, autospec=True) token = oauth.refresh_access_token() assert ("access_token", 1000) == token @@ -142,3 +172,9 @@ def test_auth_call_method(self, mocker): oauth(prepared_request) assert {"Authorization": "Bearer access_token"} == prepared_request.headers + + +def mock_request(method, url, data): + if url == "refresh_end": + return resp + raise Exception(f"Error while refreshing access token with request: {method}, {url}, {data}") diff --git a/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py index 61d8e0c69d596..86df87d48707e 100644 --- a/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py +++ 
b/airbyte-cdk/python/unit_tests/sources/streams/http/test_http.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -167,8 +167,10 @@ def max_retries(self): req.status_code = HTTPStatus.TOO_MANY_REQUESTS send_mock = mocker.patch.object(requests.Session, "send", return_value=req) - with pytest.raises(UserDefinedBackoffException): + with pytest.raises(UserDefinedBackoffException, match="Request URL: https://test_base_url.com/, Response Code: 429") as excinfo: list(stream.read_records(SyncMode.full_refresh)) + assert isinstance(excinfo.value.request, requests.PreparedRequest) + assert isinstance(excinfo.value.response, requests.Response) if retries <= 0: assert send_mock.call_count == 1 else: @@ -219,7 +221,7 @@ def test_raise_on_http_errors_off_429(mocker): req.status_code = 429 mocker.patch.object(requests.Session, "send", return_value=req) - with pytest.raises(DefaultBackoffException): + with pytest.raises(DefaultBackoffException, match="Request URL: https://test_base_url.com/, Response Code: 429"): list(stream.read_records(SyncMode.full_refresh)) @@ -238,28 +240,31 @@ def test_raise_on_http_errors_off_5xx(mocker, status_code): @pytest.mark.parametrize("status_code", [400, 401, 402, 403, 416]) def test_raise_on_http_errors_off_non_retryable_4xx(mocker, status_code): stream = AutoFailFalseHttpStream() - req = requests.Response() - req.status_code = status_code + req = requests.PreparedRequest() + res = requests.Response() + res.status_code = status_code - mocker.patch.object(requests.Session, "send", return_value=req) + mocker.patch.object(requests.Session, "send", return_value=res) response = stream._send_request(req, {}) assert response.status_code == status_code -def test_raise_on_http_errors_off_timeout(requests_mock): - stream = AutoFailFalseHttpStream() - requests_mock.register_uri("GET", stream.url_base, exc=requests.exceptions.ConnectTimeout) - - with pytest.raises(requests.exceptions.ConnectTimeout): - list(stream.read_records(SyncMode.full_refresh)) - - -def test_raise_on_http_errors_off_connection_error(requests_mock): +@pytest.mark.parametrize( + "error", + ( + requests.exceptions.ConnectTimeout, + requests.exceptions.ConnectionError, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ReadTimeout, + ), +) +def test_raise_on_http_errors(mocker, error): stream = AutoFailFalseHttpStream() - requests_mock.register_uri("GET", stream.url_base, exc=requests.exceptions.ConnectionError) + send_mock = mocker.patch.object(requests.Session, "send", side_effect=error()) - with pytest.raises(requests.exceptions.ConnectionError): + with pytest.raises(error): list(stream.read_records(SyncMode.full_refresh)) + assert send_mock.call_count == stream.max_retries + 1 class PostHttpStream(StubBasicReadHttpStream): @@ -324,13 +329,13 @@ def test_text_json_body(self, mocker, requests_mock): list(stream.read_records(sync_mode=SyncMode.full_refresh)) def test_body_for_all_methods(self, mocker, requests_mock): - """Stream must send a body for POST/PATCH/PUT methods only""" + """Stream must send a body for GET/POST/PATCH/PUT methods only""" stream = PostHttpStream() methods = { "POST": True, "PUT": True, "PATCH": True, - "GET": False, + "GET": True, "DELETE": False, "OPTIONS": False, } @@ -422,3 +427,72 @@ def test_using_cache(mocker): pass assert parent_stream.cassete.play_count != 0 + + +class AutoFailTrueHttpStream(StubBasicReadHttpStream): + raise_on_http_errors = True + + 
+@pytest.mark.parametrize("status_code", range(400, 600)) +def test_send_raise_on_http_errors_logs(mocker, status_code): + mocker.patch.object(AutoFailTrueHttpStream, "logger") + mocker.patch.object(AutoFailTrueHttpStream, "should_retry", mocker.Mock(return_value=False)) + stream = AutoFailTrueHttpStream() + req = requests.PreparedRequest() + res = requests.Response() + res.status_code = status_code + mocker.patch.object(requests.Session, "send", return_value=res) + with pytest.raises(requests.exceptions.HTTPError): + response = stream._send_request(req, {}) + stream.logger.error.assert_called_with(response.text) + assert response.status_code == status_code + + +@pytest.mark.parametrize( + "api_response, expected_message", + [ + ({"error": "something broke"}, "something broke"), + ({"error": {"message": "something broke"}}, "something broke"), + ({"error": "err-001", "message": "something broke"}, "something broke"), + ({"failure": {"message": "something broke"}}, "something broke"), + ({"error": {"errors": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}}, "one, two, three"), + ({"errors": ["one", "two", "three"]}, "one, two, three"), + ({"messages": ["one", "two", "three"]}, "one, two, three"), + ({"errors": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + ({"error": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + ({"errors": [{"error": "one"}, {"error": "two"}, {"error": "three"}]}, "one, two, three"), + ({"failures": [{"message": "one"}, {"message": "two"}, {"message": "three"}]}, "one, two, three"), + (["one", "two", "three"], "one, two, three"), + ([{"error": "one"}, {"error": "two"}, {"error": "three"}], "one, two, three"), + ({"error": True}, None), + ({"something_else": "hi"}, None), + ({}, None), + ], +) +def test_default_parse_response_error_message(api_response: dict, expected_message: Optional[str]): + stream = StubBasicReadHttpStream() + response = MagicMock() + response.json.return_value = api_response + + message = stream.parse_response_error_message(response) + assert message == expected_message + + +def test_default_parse_response_error_message_not_json(requests_mock): + stream = StubBasicReadHttpStream() + requests_mock.register_uri("GET", "mock://test.com/not_json", text="this is not json") + response = requests.get("mock://test.com/not_json") + + message = stream.parse_response_error_message(response) + assert message is None + + +def test_default_get_error_display_message_handles_http_error(mocker): + stream = StubBasicReadHttpStream() + mocker.patch.object(stream, "parse_response_error_message", return_value="my custom message") + + non_http_err_msg = stream.get_error_display_message(RuntimeError("not me")) + assert non_http_err_msg is None + + http_err_msg = stream.get_error_display_message(requests.HTTPError()) + assert http_err_msg == "my custom message" diff --git a/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py b/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py index c073bd84e0565..82fe96d412c23 100644 --- a/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py +++ b/airbyte-cdk/python/unit_tests/sources/streams/test_streams_core.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# @@ -57,6 +57,26 @@ def read_records( cursor_field = "test_cursor" primary_key = "primary_key" + namespace = "test_namespace" + + +class StreamStubIncrementalEmptyNamespace(Stream): + """ + Stub full incremental class, with empty namespace, to assist with testing. + """ + + def read_records( + self, + sync_mode: SyncMode, + cursor_field: List[str] = None, + stream_slice: Mapping[str, Any] = None, + stream_state: Mapping[str, Any] = None, + ) -> Iterable[Mapping[str, Any]]: + pass + + cursor_field = "test_cursor" + primary_key = "primary_key" + namespace = "" def test_as_airbyte_stream_incremental(mocker): @@ -71,6 +91,7 @@ def test_as_airbyte_stream_incremental(mocker): exp = AirbyteStream( name="stream_stub_incremental", + namespace="test_namespace", json_schema={}, supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], default_cursor_field=["test_cursor"], @@ -99,6 +120,47 @@ def test_supports_incremental_cursor_not_set(): assert not test_stream.supports_incremental +def test_namespace_set(): + """ + Should allow namespace property to be set. + """ + test_stream = StreamStubIncremental() + + assert test_stream.namespace == "test_namespace" + + +def test_namespace_set_to_empty_string(mocker): + """ + Should not set namespace property if equal to empty string. + """ + test_stream = StreamStubIncremental() + + mocker.patch.object(StreamStubIncremental, "get_json_schema", return_value={}) + mocker.patch.object(StreamStubIncremental, "namespace", "") + + airbyte_stream = test_stream.as_airbyte_stream() + + exp = AirbyteStream( + name="stream_stub_incremental", + json_schema={}, + supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], + default_cursor_field=["test_cursor"], + source_defined_cursor=True, + source_defined_primary_key=[["primary_key"]], + namespace=None, + ) + assert exp == airbyte_stream + + +def test_namespace_not_set(): + """ + Should be equal to unset value of None. + """ + test_stream = StreamStubFullRefresh() + + assert test_stream.namespace is None + + @pytest.mark.parametrize( "test_input, expected", [("key", [["key"]]), (["key1", "key2"], [["key1"], ["key2"]]), ([["key1", "key2"], ["key3"]], [["key1", "key2"], ["key3"]])], diff --git a/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py b/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py index 703504598c61b..7bae0e7a163b1 100644 --- a/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py +++ b/airbyte-cdk/python/unit_tests/sources/test_abstract_source.py @@ -1,10 +1,11 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# import logging from collections import defaultdict from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from unittest.mock import call import pytest from airbyte_cdk.models import ( @@ -23,12 +24,17 @@ ) from airbyte_cdk.sources import AbstractSource from airbyte_cdk.sources.streams import Stream +from airbyte_cdk.utils.traced_exception import AirbyteTracedException logger = logging.getLogger("airbyte") class MockSource(AbstractSource): - def __init__(self, check_lambda: Callable[[], Tuple[bool, Optional[Any]]] = None, streams: List[Stream] = None): + def __init__( + self, + check_lambda: Callable[[], Tuple[bool, Optional[Any]]] = None, + streams: List[Stream] = None, + ): self._streams = streams self.check_lambda = check_lambda @@ -56,13 +62,17 @@ def test_failed_check(): def test_raising_check(): - """Tests that if a source raises an unexpected exception the connection check the appropriate connectionStatus failure message is returned""" + """Tests that if a source raises an unexpected exception the appropriate connectionStatus failure message is returned.""" expected = AirbyteConnectionStatus(status=Status.FAILED, message="Exception('this should fail')") assert expected == MockSource(check_lambda=lambda: exec('raise Exception("this should fail")')).check(logger, {}) class MockStream(Stream): - def __init__(self, inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]] = None, name: str = None): + def __init__( + self, + inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]] = None, + name: str = None, + ): self._inputs_and_mocked_outputs = inputs_and_mocked_outputs self._name = name @@ -85,6 +95,22 @@ def primary_key(self) -> Optional[Union[str, List[str], List[List[str]]]]: return "pk" +class MockStreamWithState(MockStream): + cursor_field = "cursor" + + def __init__(self, inputs_and_mocked_outputs: List[Tuple[Mapping[str, Any], Iterable[Mapping[str, Any]]]], name: str, state=None): + super().__init__(inputs_and_mocked_outputs, name) + self._state = state + + @property + def state(self): + return self._state + + @state.setter + def state(self, value): + pass + + def test_discover(mocker): """Tests that the appropriate AirbyteCatalog is returned from the discover method""" airbyte_stream1 = AirbyteStream( @@ -121,11 +147,34 @@ def test_read_nonexistent_stream_raises_exception(mocker): list(src.read(logger, {}, catalog)) +def test_read_stream_with_error_gets_display_message(mocker): + stream = MockStream(name="my_stream") + + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "read_records", side_effect=RuntimeError("oh no!")) + + source = MockSource(streams=[stream]) + catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(stream, SyncMode.full_refresh)]) + + # without get_error_display_message + with pytest.raises(RuntimeError, match="oh no!"): + list(source.read(logger, {}, catalog)) + + mocker.patch.object(MockStream, "get_error_display_message", return_value="my message") + + with pytest.raises(AirbyteTracedException, match="oh no!") as exc: + list(source.read(logger, {}, catalog)) + assert exc.value.message == "my message" + + GLOBAL_EMITTED_AT = 1 def _as_record(stream: str, data: Dict[str, Any]) -> AirbyteMessage: - return AirbyteMessage(type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=GLOBAL_EMITTED_AT)) + return AirbyteMessage( + type=Type.RECORD, + 
record=AirbyteRecordMessage(stream=stream, data=data, emitted_at=GLOBAL_EMITTED_AT), + ) def _as_records(stream: str, data: List[Dict[str, Any]]) -> List[AirbyteMessage]: @@ -134,7 +183,9 @@ def _as_records(stream: str, data: List[Dict[str, Any]]) -> List[AirbyteMessage] def _configured_stream(stream: Stream, sync_mode: SyncMode): return ConfiguredAirbyteStream( - stream=stream.as_airbyte_stream(), sync_mode=sync_mode, destination_sync_mode=DestinationSyncMode.overwrite + stream=stream.as_airbyte_stream(), + sync_mode=sync_mode, + destination_sync_mode=DestinationSyncMode.overwrite, ) @@ -155,7 +206,10 @@ def test_valid_full_refresh_read_no_slices(mocker): src = MockSource(streams=[s1, s2]) catalog = ConfiguredAirbyteCatalog( - streams=[_configured_stream(s1, SyncMode.full_refresh), _configured_stream(s2, SyncMode.full_refresh)] + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + ] ) expected = _as_records("s1", stream_output) + _as_records("s2", stream_output) @@ -168,15 +222,24 @@ def test_valid_full_refresh_read_with_slices(mocker): """Tests that running a full refresh sync on streams which use slices produces the expected AirbyteMessages""" slices = [{"1": "1"}, {"2": "2"}] # When attempting to sync a slice, just output that slice as a record - s1 = MockStream([({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], name="s1") - s2 = MockStream([({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], name="s2") + s1 = MockStream( + [({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], + name="s1", + ) + s2 = MockStream( + [({"sync_mode": SyncMode.full_refresh, "stream_slice": s}, [s]) for s in slices], + name="s2", + ) mocker.patch.object(MockStream, "get_json_schema", return_value={}) mocker.patch.object(MockStream, "stream_slices", return_value=slices) src = MockSource(streams=[s1, s2]) catalog = ConfiguredAirbyteCatalog( - streams=[_configured_stream(s1, SyncMode.full_refresh), _configured_stream(s2, SyncMode.full_refresh)] + streams=[ + _configured_stream(s1, SyncMode.full_refresh), + _configured_stream(s2, SyncMode.full_refresh), + ] ) expected = [*_as_records("s1", slices), *_as_records("s2", slices)] @@ -190,155 +253,361 @@ def _state(state_data: Dict[str, Any]): return AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage(data=state_data)) -def test_valid_incremental_read_with_checkpoint_interval(mocker): - """Tests that an incremental read which doesn't specify a checkpoint interval outputs a STATE message after reading N records within a stream""" - stream_output = [{"k1": "v1"}, {"k2": "v2"}] - s1 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s1") - s2 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s2") - state = {"cursor": "value"} - mocker.patch.object(MockStream, "get_updated_state", return_value=state) - mocker.patch.object(MockStream, "supports_incremental", return_value=True) - mocker.patch.object(MockStream, "get_json_schema", return_value={}) - # Tell the source to output one state message per record - mocker.patch.object(MockStream, "state_checkpoint_interval", new_callable=mocker.PropertyMock, return_value=1) - - src = MockSource(streams=[s1, s2]) - catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s1, SyncMode.incremental), _configured_stream(s2, SyncMode.incremental)]) - - expected = [ - _as_record("s1", 
stream_output[0]), - _state({"s1": state}), - _as_record("s1", stream_output[1]), - _state({"s1": state}), - _state({"s1": state}), - _as_record("s2", stream_output[0]), - _state({"s1": state, "s2": state}), - _as_record("s2", stream_output[1]), - _state({"s1": state, "s2": state}), - _state({"s1": state, "s2": state}), - ] - messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) - - assert expected == messages - - -def test_valid_incremental_read_with_no_interval(mocker): - """Tests that an incremental read which doesn't specify a checkpoint interval outputs a STATE message only after fully reading the stream and does - not output any STATE messages during syncing the stream.""" - stream_output = [{"k1": "v1"}, {"k2": "v2"}] - s1 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s1") - s2 = MockStream([({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], name="s2") - state = {"cursor": "value"} - mocker.patch.object(MockStream, "get_updated_state", return_value=state) - mocker.patch.object(MockStream, "supports_incremental", return_value=True) - mocker.patch.object(MockStream, "get_json_schema", return_value={}) - - src = MockSource(streams=[s1, s2]) - catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s1, SyncMode.incremental), _configured_stream(s2, SyncMode.incremental)]) - - expected = [ - *_as_records("s1", stream_output), - _state({"s1": state}), - *_as_records("s2", stream_output), - _state({"s1": state, "s2": state}), - ] - - messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) - - assert expected == messages - - -def test_valid_incremental_read_with_slices(mocker): - """Tests that an incremental read which uses slices outputs each record in the slice followed by a STATE message, for each slice""" - slices = [{"1": "1"}, {"2": "2"}] - stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] - s1 = MockStream( - [({"sync_mode": SyncMode.incremental, "stream_slice": s, "stream_state": mocker.ANY}, stream_output) for s in slices], name="s1" - ) - s2 = MockStream( - [({"sync_mode": SyncMode.incremental, "stream_slice": s, "stream_state": mocker.ANY}, stream_output) for s in slices], name="s2" - ) - state = {"cursor": "value"} - mocker.patch.object(MockStream, "get_updated_state", return_value=state) - mocker.patch.object(MockStream, "supports_incremental", return_value=True) - mocker.patch.object(MockStream, "get_json_schema", return_value={}) - mocker.patch.object(MockStream, "stream_slices", return_value=slices) - - src = MockSource(streams=[s1, s2]) - catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s1, SyncMode.incremental), _configured_stream(s2, SyncMode.incremental)]) - - expected = [ - # stream 1 slice 1 - *_as_records("s1", stream_output), - _state({"s1": state}), - # stream 1 slice 2 - *_as_records("s1", stream_output), - _state({"s1": state}), - # stream 2 slice 1 - *_as_records("s2", stream_output), - _state({"s1": state, "s2": state}), - # stream 2 slice 2 - *_as_records("s2", stream_output), - _state({"s1": state, "s2": state}), - ] - - messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) - - assert expected == messages - - -def test_valid_incremental_read_with_slices_and_interval(mocker): - """ - Tests that an incremental read which uses slices and a checkpoint interval: - 1. outputs all records - 2. outputs a state message every N records (N=checkpoint_interval) - 3. 
outputs a state message after reading the entire slice - """ - slices = [{"1": "1"}, {"2": "2"}] - stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] - s1 = MockStream( - [({"sync_mode": SyncMode.incremental, "stream_slice": s, "stream_state": mocker.ANY}, stream_output) for s in slices], name="s1" - ) - s2 = MockStream( - [({"sync_mode": SyncMode.incremental, "stream_slice": s, "stream_state": mocker.ANY}, stream_output) for s in slices], name="s2" - ) - state = {"cursor": "value"} - mocker.patch.object(MockStream, "get_updated_state", return_value=state) - mocker.patch.object(MockStream, "supports_incremental", return_value=True) - mocker.patch.object(MockStream, "get_json_schema", return_value={}) - mocker.patch.object(MockStream, "stream_slices", return_value=slices) - mocker.patch.object(MockStream, "state_checkpoint_interval", new_callable=mocker.PropertyMock, return_value=2) - - src = MockSource(streams=[s1, s2]) - catalog = ConfiguredAirbyteCatalog(streams=[_configured_stream(s1, SyncMode.incremental), _configured_stream(s2, SyncMode.incremental)]) - - expected = [ - # stream 1 slice 1 - _as_record("s1", stream_output[0]), - _as_record("s1", stream_output[1]), - _state({"s1": state}), - _as_record("s1", stream_output[2]), - _state({"s1": state}), - # stream 1 slice 2 - _as_record("s1", stream_output[0]), - _as_record("s1", stream_output[1]), - _state({"s1": state}), - _as_record("s1", stream_output[2]), - _state({"s1": state}), - # stream 2 slice 1 - _as_record("s2", stream_output[0]), - _as_record("s2", stream_output[1]), - _state({"s1": state, "s2": state}), - _as_record("s2", stream_output[2]), - _state({"s1": state, "s2": state}), - # stream 2 slice 2 - _as_record("s2", stream_output[0]), - _as_record("s2", stream_output[1]), - _state({"s1": state, "s2": state}), - _as_record("s2", stream_output[2]), - _state({"s1": state, "s2": state}), - ] - - messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) - - assert expected == messages +class TestIncrementalRead: + def test_with_state_attribute(self, mocker): + """Test correct state passing for the streams that have a state attribute""" + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + old_state = {"cursor": "old_value"} + new_state = {"cursor": "new_value"} + s1 = MockStreamWithState( + [ + ( + {"sync_mode": SyncMode.incremental, "stream_state": old_state}, + stream_output, + ) + ], + name="s1", + ) + s2 = MockStreamWithState( + [({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], + name="s2", + ) + mocker.patch.object(MockStreamWithState, "get_updated_state", return_value={}) + state_property = mocker.patch.object( + MockStreamWithState, + "state", + new_callable=mocker.PropertyMock, + return_value=new_state, + ) + mocker.patch.object(MockStreamWithState, "get_json_schema", return_value={}) + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + _as_record("s1", stream_output[0]), + _as_record("s1", stream_output[1]), + _state({"s1": new_state}), + _as_record("s2", stream_output[0]), + _as_record("s2", stream_output[1]), + _state({"s1": new_state, "s2": new_state}), + ] + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state={"s1": old_state}))) + + assert expected == messages + assert state_property.mock_calls == [ + call(old_state), # set state for s1 + call(), # get state in the end of slice for s1 + 
call(), # get state in the end of slice for s2 + ] + + def test_with_checkpoint_interval(self, mocker): + """Tests that an incremental read which specifies a checkpoint interval outputs a STATE message + after reading N records within a stream. + """ + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], + name="s1", + ) + s2 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + # Tell the source to output one state message per record + mocker.patch.object( + MockStream, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=1, + ) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + _as_record("s1", stream_output[0]), + _state({"s1": state}), + _as_record("s1", stream_output[1]), + _state({"s1": state}), + _state({"s1": state}), + _as_record("s2", stream_output[0]), + _state({"s1": state, "s2": state}), + _as_record("s2", stream_output[1]), + _state({"s1": state, "s2": state}), + _state({"s1": state, "s2": state}), + ] + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) + + assert expected == messages + + def test_with_no_interval(self, mocker): + """Tests that an incremental read which doesn't specify a checkpoint interval outputs + a STATE message only after fully reading the stream and does not output any STATE messages during syncing the stream.
+ """ + stream_output = [{"k1": "v1"}, {"k2": "v2"}] + s1 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], + name="s1", + ) + s2 = MockStream( + [({"sync_mode": SyncMode.incremental, "stream_state": {}}, stream_output)], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + *_as_records("s1", stream_output), + _state({"s1": state}), + *_as_records("s2", stream_output), + _state({"s1": state, "s2": state}), + ] + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) + + assert expected == messages + + def test_with_slices(self, mocker): + """Tests that an incremental read which uses slices outputs each record in the slice followed by a STATE message, for each slice""" + slices = [{"1": "1"}, {"2": "2"}] + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + s1 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + ) + s2 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + # stream 1 slice 1 + *_as_records("s1", stream_output), + _state({"s1": state}), + # stream 1 slice 2 + *_as_records("s1", stream_output), + _state({"s1": state}), + # stream 2 slice 1 + *_as_records("s2", stream_output), + _state({"s1": state, "s2": state}), + # stream 2 slice 2 + *_as_records("s2", stream_output), + _state({"s1": state, "s2": state}), + ] + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) + + assert expected == messages + + def test_no_slices(self, mocker): + """ + Tests that an incremental read returns at least one state messages even if no records were read: + 1. 
outputs a state message after reading the entire stream + """ + slices = [] + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + state = {"cursor": "value"} + s1 = MockStreamWithState( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + state=state, + ) + s2 = MockStreamWithState( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + state=state, + ) + + mocker.patch.object(MockStreamWithState, "supports_incremental", return_value=True) + mocker.patch.object(MockStreamWithState, "get_json_schema", return_value={}) + mocker.patch.object(MockStreamWithState, "stream_slices", return_value=slices) + mocker.patch.object( + MockStreamWithState, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=2, + ) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + _state({"s1": state}), + _state({"s1": state, "s2": state}), + ] + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) + + print(f"expected:\n{expected}") + print(f"messages:\n{messages}") + assert expected == messages + + def test_with_slices_and_interval(self, mocker): + """ + Tests that an incremental read which uses slices and a checkpoint interval: + 1. outputs all records + 2. outputs a state message every N records (N=checkpoint_interval) + 3. outputs a state message after reading the entire slice + """ + slices = [{"1": "1"}, {"2": "2"}] + stream_output = [{"k1": "v1"}, {"k2": "v2"}, {"k3": "v3"}] + s1 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s1", + ) + s2 = MockStream( + [ + ( + { + "sync_mode": SyncMode.incremental, + "stream_slice": s, + "stream_state": mocker.ANY, + }, + stream_output, + ) + for s in slices + ], + name="s2", + ) + state = {"cursor": "value"} + mocker.patch.object(MockStream, "get_updated_state", return_value=state) + mocker.patch.object(MockStream, "supports_incremental", return_value=True) + mocker.patch.object(MockStream, "get_json_schema", return_value={}) + mocker.patch.object(MockStream, "stream_slices", return_value=slices) + mocker.patch.object( + MockStream, + "state_checkpoint_interval", + new_callable=mocker.PropertyMock, + return_value=2, + ) + + src = MockSource(streams=[s1, s2]) + catalog = ConfiguredAirbyteCatalog( + streams=[ + _configured_stream(s1, SyncMode.incremental), + _configured_stream(s2, SyncMode.incremental), + ] + ) + + expected = [ + # stream 1 slice 1 + _as_record("s1", stream_output[0]), + _as_record("s1", stream_output[1]), + _state({"s1": state}), + _as_record("s1", stream_output[2]), + _state({"s1": state}), + # stream 1 slice 2 + _as_record("s1", stream_output[0]), + _as_record("s1", stream_output[1]), + _state({"s1": state}), + _as_record("s1", stream_output[2]), + _state({"s1": state}), + # stream 2 slice 1 + _as_record("s2", stream_output[0]), + _as_record("s2", stream_output[1]), + _state({"s1": state, "s2": state}), + _as_record("s2", stream_output[2]), + _state({"s1": state, "s2": state}), + # stream 2 slice 2 + _as_record("s2", stream_output[0]), + _as_record("s2", stream_output[1]), + 
_state({"s1": state, "s2": state}), + _as_record("s2", stream_output[2]), + _state({"s1": state, "s2": state}), + ] + + messages = _fix_emitted_at(list(src.read(logger, {}, catalog, state=defaultdict(dict)))) + + assert expected == messages diff --git a/airbyte-cdk/python/unit_tests/sources/test_config.py b/airbyte-cdk/python/unit_tests/sources/test_config.py index 909ed681d0885..b76be571f93df 100644 --- a/airbyte-cdk/python/unit_tests/sources/test_config.py +++ b/airbyte-cdk/python/unit_tests/sources/test_config.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # from typing import List, Union diff --git a/airbyte-cdk/python/unit_tests/sources/test_source.py b/airbyte-cdk/python/unit_tests/sources/test_source.py index 141168494c236..de2b282012eef 100644 --- a/airbyte-cdk/python/unit_tests/sources/test_source.py +++ b/airbyte-cdk/python/unit_tests/sources/test_source.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -135,7 +135,8 @@ def test_internal_config(abstract_source, catalog): non_http_stream.read_records.return_value = [{}] * 3 # Test with empty config - records = [r for r in abstract_source.read(logger=MagicMock(), config={}, catalog=catalog, state={})] + logger = logging.getLogger(f"airbyte.{getattr(abstract_source, 'name', '')}") + records = [r for r in abstract_source.read(logger=logger, config={}, catalog=catalog, state={})] # 3 for http stream and 3 for non http stream assert len(records) == 3 + 3 assert http_stream.read_records.called @@ -145,19 +146,19 @@ def test_internal_config(abstract_source, catalog): assert not non_http_stream.page_size # Test with records limit set to 1 internal_config = {"some_config": 100, "_limit": 1} - records = [r for r in abstract_source.read(logger=MagicMock(), config=internal_config, catalog=catalog, state={})] + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] # 1 from http stream + 1 from non http stream assert len(records) == 1 + 1 assert "_limit" not in abstract_source.streams_config assert "some_config" in abstract_source.streams_config # Test with records limit set to number that exceeds expceted records internal_config = {"some_config": 100, "_limit": 20} - records = [r for r in abstract_source.read(logger=MagicMock(), config=internal_config, catalog=catalog, state={})] + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] assert len(records) == 3 + 3 # Check if page_size paramter is set to http instance only internal_config = {"some_config": 100, "_page_size": 2} - records = [r for r in abstract_source.read(logger=MagicMock(), config=internal_config, catalog=catalog, state={})] + records = [r for r in abstract_source.read(logger=logger, config=internal_config, catalog=catalog, state={})] assert "_page_size" not in abstract_source.streams_config assert "some_config" in abstract_source.streams_config assert len(records) == 3 + 3 @@ -168,6 +169,7 @@ def test_internal_config(abstract_source, catalog): def test_internal_config_limit(abstract_source, catalog): logger_mock = MagicMock() + logger_mock.level = logging.DEBUG del catalog.streams[1] STREAM_LIMIT = 2 FULL_RECORDS_NUMBER = 3 @@ -205,6 +207,7 @@ def test_internal_config_limit(abstract_source, catalog): def test_source_config_no_transform(abstract_source, catalog): logger_mock = MagicMock() 
+ logger_mock.level = logging.DEBUG streams = abstract_source.streams(None) http_stream, non_http_stream = streams http_stream.get_json_schema.return_value = non_http_stream.get_json_schema.return_value = SCHEMA @@ -218,6 +221,7 @@ def test_source_config_no_transform(abstract_source, catalog): def test_source_config_transform(abstract_source, catalog): logger_mock = MagicMock() + logger_mock.level = logging.DEBUG streams = abstract_source.streams(None) http_stream, non_http_stream = streams http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) @@ -231,6 +235,7 @@ def test_source_config_transform(abstract_source, catalog): def test_source_config_transform_and_no_transform(abstract_source, catalog): logger_mock = MagicMock() + logger_mock.level = logging.DEBUG streams = abstract_source.streams(None) http_stream, non_http_stream = streams http_stream.transformer = TypeTransformer(TransformConfig.DefaultSchemaNormalization) diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_catalog_helpers.py b/airbyte-cdk/python/unit_tests/sources/utils/test_catalog_helpers.py index 2c19726fcb253..8b82688aea620 100644 --- a/airbyte-cdk/python/unit_tests/sources/utils/test_catalog_helpers.py +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_catalog_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py index 3899141a21baf..55328fed0f2a4 100644 --- a/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_helpers.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# @@ -12,10 +12,9 @@ from pathlib import Path import jsonref -import pytest from airbyte_cdk.logger import AirbyteLogger from airbyte_cdk.models.airbyte_protocol import ConnectorSpecification -from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader, check_config_against_spec_or_exit, get_secret_values +from airbyte_cdk.sources.utils.schema_helpers import ResourceSchemaLoader, check_config_against_spec_or_exit from pytest import fixture from pytest import raises as pytest_raises @@ -186,69 +185,3 @@ def test_shared_schemas_resolves_nested(): # Make sure generated schema is JSON serializable assert json.dumps(actual_schema) assert jsonref.JsonRef.replace_refs(actual_schema) - - -@pytest.mark.parametrize( - "schema,config,expected", - [ - ( - { - "type": "object", - "properties": { - "credentials": { - "type": "object", - "oneOf": [ - { - "type": "object", - "properties": { - "option_title": { - "type": "string", - "const": "OAuth Credentials", - } - }, - }, - { - "type": "object", - "properties": { - "option_title": {"type": "string"}, - "personal_access_token": { - "type": "string", - "airbyte_secret": True, - }, - }, - }, - ], - }, - "repository": {"type": "string"}, - "start_date": {"type": "string"}, - }, - }, - {"credentials": {"personal_access_token": "secret"}}, - ["secret"], - ), - ( - { - "type": "object", - "properties": { - "access_token": {"type": "string", "airbyte_secret": True}, - "whatever": {"type": "string", "airbyte_secret": False}, - }, - }, - {"access_token": "secret"}, - ["secret"], - ), - ( - { - "type": "object", - "properties": { - "access_token": {"type": "string", "airbyte_secret": False}, - "whatever": {"type": "string", "airbyte_secret": False}, - }, - }, - {"access_token": "secret"}, - [], - ), - ], -) -def test_get_secret_values(schema, config, expected): - assert get_secret_values(schema, config) == expected diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_schema_models.py b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_models.py index ca2fec7213d45..b0419dd1b26f1 100644 --- a/airbyte-cdk/python/unit_tests/sources/utils/test_schema_models.py +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_schema_models.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # from typing import List, Optional diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_sentry.py b/airbyte-cdk/python/unit_tests/sources/utils/test_sentry.py deleted file mode 100644 index fd01cc6536154..0000000000000 --- a/airbyte-cdk/python/unit_tests/sources/utils/test_sentry.py +++ /dev/null @@ -1,125 +0,0 @@ -# -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. 
-# - -import json -import os -from dataclasses import dataclass -from logging import getLogger -from typing import List -from unittest import mock - -import requests -from airbyte_cdk.sources.utils.sentry import AirbyteSentry -from sentry_sdk.transport import Transport - - -@mock.patch("airbyte_cdk.sources.utils.sentry.sentry_sdk") -def test_sentry_init_no_env(sentry_mock): - assert AirbyteSentry.DSN_ENV_NAME not in os.environ - AirbyteSentry.init("test_source") - assert not sentry_mock.init.called - assert not AirbyteSentry.sentry_enabled - AirbyteSentry.set_tag("tagname", "value") - assert not sentry_mock.set_tag.called - AirbyteSentry.add_breadcrumb("msg", data={}) - assert not sentry_mock.add_breadcrumb.called - - with AirbyteSentry.start_transaction("name", "op"): - assert not sentry_mock.start_transaction.called - - with AirbyteSentry.start_transaction_span("name", "op"): - assert not sentry_mock.start_span.called - - -@mock.patch.dict(os.environ, {AirbyteSentry.DSN_ENV_NAME: "dsn"}) -@mock.patch("airbyte_cdk.sources.utils.sentry.sentry_sdk") -def test_sentry_init(sentry_mock): - AirbyteSentry.init("test_source") - assert sentry_mock.init.called - sentry_mock.set_tag.assert_any_call("source", "test_source") - sentry_mock.set_tag.assert_any_call("run_id", mock.ANY) - assert AirbyteSentry.sentry_enabled - AirbyteSentry.set_tag("tagname", "value") - assert sentry_mock.set_tag.called - AirbyteSentry.add_breadcrumb("msg", data={}) - assert sentry_mock.add_breadcrumb.called - with AirbyteSentry.start_transaction("name", "op"): - assert sentry_mock.start_transaction.called - - with AirbyteSentry.start_transaction_span("name", "op"): - assert sentry_mock.start_span.called - - -@dataclass -class TestTransport(Transport): - secrets: List[str] - # Sentry sdk wraps sending event with try except that would intercept - # AssertionError exception resulting it would ignore assert directive. - # Use this variable to check if test failed after sentry code executed. 
- failed = None - - def capture_envelope(self, envelop): - for s in self.secrets: - for i in envelop.items: - payload = json.dumps(i.payload.json) - assert s not in payload - - def capture_event(self, event): - if self.failed: - return - event = json.dumps(event) - for s in self.secrets: - if s in event: - self.failed = f"{s} should not be in {event}" - return - - -@mock.patch.dict(os.environ, {AirbyteSentry.DSN_ENV_NAME: "https://22222@222.ingest.sentry.io/111"}) -def test_sentry_sensitive_info(httpserver): - SECRET = "SOME_secret" - UNEXPECTED_SECRET = "UnexEpectedSecret" - SECRETS = [SECRET] - transport = TestTransport(secrets=[*SECRETS, UNEXPECTED_SECRET]) - - AirbyteSentry.init("test_source", transport=transport, secret_values=SECRETS) - - AirbyteSentry.add_breadcrumb("msg", {"crumb": SECRET}) - AirbyteSentry.set_context("my secret", {"api_key": SECRET}) - AirbyteSentry.capture_message(f"this is {SECRET}") - AirbyteSentry.capture_message(f"Issue url http://localhost:{httpserver.port}/test?api_key={UNEXPECTED_SECRET}") - AirbyteSentry.capture_message(f"Issue url http://localhost:{httpserver.port}/test?access_token={UNEXPECTED_SECRET}") - AirbyteSentry.capture_message(f"Issue url http://localhost:{httpserver.port}/test?refresh_token={UNEXPECTED_SECRET}") - AirbyteSentry.set_context("headers", {"Authorization": f"Bearer {UNEXPECTED_SECRET}"}) - getLogger("airbyte").info(f"this is {SECRET}") - requests.get( - f"http://localhost:{httpserver.port}/test?api_key={SECRET}", - headers={"Authorization": f"Bearer {SECRET}"}, - ).text - requests.get( - f"http://localhost:{httpserver.port}/test?api_key={UNEXPECTED_SECRET}", - headers={"Authorization": f"Bearer {UNEXPECTED_SECRET}"}, - ).text - AirbyteSentry.capture_exception(Exception(f"Secret info: {SECRET}")) - assert not transport.failed - - -@mock.patch.dict(os.environ, {AirbyteSentry.DSN_ENV_NAME: "https://22222@222.ingest.sentry.io/111"}) -def test_sentry_sensitive_info_transactions(httpserver): - SECRET = "SOME_secret" - SECRETS = [SECRET] - UNEXPECTED_SECRET = "UnexEpectedSecret" - transport = TestTransport(secrets=[*SECRETS, UNEXPECTED_SECRET]) - AirbyteSentry.init("test_source", transport=transport, secret_values=SECRETS) - - AirbyteSentry.set_context("my secret", {"api_key": SECRET}) - AirbyteSentry.set_context("headers", {"Authorization": f"Bearer {UNEXPECTED_SECRET}"}) - with AirbyteSentry.start_transaction("name", "op"): - with AirbyteSentry.start_transaction_span( - "name", description=f"http://localhost:{httpserver.port}/test?api_key={UNEXPECTED_SECRET}" - ): - requests.get( - f"http://localhost:{httpserver.port}/test?api_key={SECRET}", - headers={"Authorization": f"Bearer {SECRET}"}, - ).text - assert not transport.failed diff --git a/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py b/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py index e801e475b92a8..d61a6dfa17dcb 100644 --- a/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py +++ b/airbyte-cdk/python/unit_tests/sources/utils/test_transform.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# import json @@ -151,15 +151,17 @@ ), ], ) -def test_transform(schema, actual, expected, expected_warns, capsys): +def test_transform(schema, actual, expected, expected_warns, caplog): t = TypeTransformer(TransformConfig.DefaultSchemaNormalization) t.transform(actual, schema) assert json.dumps(actual) == json.dumps(expected) - stdout = capsys.readouterr().out if expected_warns: - assert expected_warns in stdout + record = caplog.records[0] + assert record.name == "airbyte" + assert record.levelname == "WARNING" + assert record.message == expected_warns else: - assert not stdout + assert len(caplog.records) == 0 def test_transform_wrong_config(): diff --git a/airbyte-cdk/python/unit_tests/test_connector.py b/airbyte-cdk/python/unit_tests/test_connector.py index 463b58290cba8..06e9dd16ead15 100644 --- a/airbyte-cdk/python/unit_tests/test_connector.py +++ b/airbyte-cdk/python/unit_tests/test_connector.py @@ -1,18 +1,27 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # import json import logging +import os +import sys import tempfile from pathlib import Path from typing import Any, Mapping import pytest +import yaml from airbyte_cdk import AirbyteSpec, Connector from airbyte_cdk.models import AirbyteConnectionStatus +logger = logging.getLogger("airbyte") + +MODULE = sys.modules[__name__] +MODULE_PATH = os.path.abspath(MODULE.__file__) +SPEC_ROOT = os.path.dirname(MODULE_PATH) + class TestAirbyteSpec: VALID_SPEC = { @@ -71,3 +80,53 @@ def test_write_config(integration, mock_config): integration.write_config(mock_config, str(config_path)) with open(config_path, "r") as actual: assert mock_config == json.loads(actual.read()) + + +class TestConnectorSpec: + CONNECTION_SPECIFICATION = { + "type": "object", + "required": ["api_token"], + "additionalProperties": False, + "properties": {"api_token": {"type": "string"}}, + } + + @pytest.fixture + def use_json_spec(self): + spec = { + "documentationUrl": "https://airbyte.com/#json", + "connectionSpecification": self.CONNECTION_SPECIFICATION, + } + + json_path = os.path.join(SPEC_ROOT, "spec.json") + with open(json_path, "w") as f: + f.write(json.dumps(spec)) + yield + os.remove(json_path) + + @pytest.fixture + def use_yaml_spec(self): + spec = {"documentationUrl": "https://airbyte.com/#yaml", "connectionSpecification": self.CONNECTION_SPECIFICATION} + + yaml_path = os.path.join(SPEC_ROOT, "spec.yaml") + with open(yaml_path, "w") as f: + f.write(yaml.dump(spec)) + yield + os.remove(yaml_path) + + def test_spec_from_json_file(self, integration, use_json_spec): + connector_spec = integration.spec(logger) + assert connector_spec.documentationUrl == "https://airbyte.com/#json" + assert connector_spec.connectionSpecification == self.CONNECTION_SPECIFICATION + + def test_spec_from_yaml_file(self, integration, use_yaml_spec): + connector_spec = integration.spec(logger) + assert connector_spec.documentationUrl == "https://airbyte.com/#yaml" + assert connector_spec.connectionSpecification == self.CONNECTION_SPECIFICATION + + def test_multiple_spec_files_raises_exception(self, integration, use_yaml_spec, use_json_spec): + with pytest.raises(RuntimeError, match="spec.yaml or spec.json"): + integration.spec(logger) + + def test_no_spec_file_raises_exception(self, integration): + with pytest.raises(FileNotFoundError, match="Unable to find spec."): + integration.spec(logger) diff --git a/airbyte-cdk/python/unit_tests/test_counter.py b/airbyte-cdk/python/unit_tests/test_counter.py index 
c6ffd9a5589d7..74581b3c01321 100644 --- a/airbyte-cdk/python/unit_tests/test_counter.py +++ b/airbyte-cdk/python/unit_tests/test_counter.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # diff --git a/airbyte-cdk/python/unit_tests/test_entrypoint.py b/airbyte-cdk/python/unit_tests/test_entrypoint.py index f34c9c033055e..26bc5b8554102 100644 --- a/airbyte-cdk/python/unit_tests/test_entrypoint.py +++ b/airbyte-cdk/python/unit_tests/test_entrypoint.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # @@ -10,6 +10,7 @@ import pytest from airbyte_cdk import AirbyteEntrypoint +from airbyte_cdk import entrypoint as entrypoint_module from airbyte_cdk.models import ( AirbyteCatalog, AirbyteConnectionStatus, @@ -38,7 +39,8 @@ def _as_arglist(cmd: str, named_args: Mapping[str, Any]) -> List[str]: out = [cmd] for k, v in named_args.items(): out.append(f"--{k}") - out.append(v) + if v: + out.append(v) return out @@ -55,20 +57,34 @@ def entrypoint() -> AirbyteEntrypoint: return AirbyteEntrypoint(MockSource()) +def test_airbyte_entrypoint_init(mocker): + mocker.patch.object(entrypoint_module, "init_uncaught_exception_handler") + AirbyteEntrypoint(MockSource()) + entrypoint_module.init_uncaught_exception_handler.assert_called_once_with(entrypoint_module.logger) + + @pytest.mark.parametrize( - ["cmd", "args"], + ["cmd", "args", "expected_args"], [ - ("spec", dict()), - ("check", {"config": "config_path"}), - ("discover", {"config": "config_path"}), - ("read", {"config": "config_path", "catalog": "catalog_path", "state": "None"}), - ("read", {"config": "config_path", "catalog": "catalog_path", "state": "state_path"}), + ("spec", {"debug": ""}, {"command": "spec", "debug": True}), + ("check", {"config": "config_path"}, {"command": "check", "config": "config_path", "debug": False}), + ("discover", {"config": "config_path", "debug": ""}, {"command": "discover", "config": "config_path", "debug": True}), + ( + "read", + {"config": "config_path", "catalog": "catalog_path", "state": "None"}, + {"command": "read", "config": "config_path", "catalog": "catalog_path", "state": "None", "debug": False}, + ), + ( + "read", + {"config": "config_path", "catalog": "catalog_path", "state": "state_path", "debug": ""}, + {"command": "read", "config": "config_path", "catalog": "catalog_path", "state": "state_path", "debug": True}, + ), ], ) -def test_parse_valid_args(cmd: str, args: Mapping[str, Any], entrypoint: AirbyteEntrypoint): +def test_parse_valid_args(cmd: str, args: Mapping[str, Any], expected_args, entrypoint: AirbyteEntrypoint): arglist = _as_arglist(cmd, args) parsed_args = entrypoint.parse_args(arglist) - assert {"command": cmd, **args} == vars(parsed_args) + assert vars(parsed_args) == expected_args @pytest.mark.parametrize( diff --git a/airbyte-cdk/python/unit_tests/test_exception_handler.py b/airbyte-cdk/python/unit_tests/test_exception_handler.py new file mode 100644 index 0000000000000..7c9d84bb454cc --- /dev/null +++ b/airbyte-cdk/python/unit_tests/test_exception_handler.py @@ -0,0 +1,57 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
+# + + +import json +import subprocess +import sys + +import pytest +from airbyte_cdk.models import AirbyteErrorTraceMessage, AirbyteLogMessage, AirbyteMessage, AirbyteTraceMessage + + +def test_uncaught_exception_handler(): + cmd = "from airbyte_cdk.logger import init_logger; from airbyte_cdk.exception_handler import init_uncaught_exception_handler; logger = init_logger('airbyte'); init_uncaught_exception_handler(logger); raise 1" + exception_message = "exceptions must derive from BaseException" + exception_trace = ( + "Traceback (most recent call last):\n" + ' File "<string>", line 1, in <module>\n' + "TypeError: exceptions must derive from BaseException" + ) + + expected_log_message = AirbyteMessage( + type="LOG", log=AirbyteLogMessage(level="FATAL", message=f"{exception_message}\n{exception_trace}") + ) + + expected_trace_message = AirbyteMessage( + type="TRACE", + trace=AirbyteTraceMessage( + type="ERROR", + emitted_at=0.0, + error=AirbyteErrorTraceMessage( + failure_type="system_error", + message="Something went wrong in the connector. See the logs for more details.", + internal_message=exception_message, + stack_trace=f"{exception_trace}\n", + ), + ), + ) + + with pytest.raises(subprocess.CalledProcessError) as err: + subprocess.check_output([sys.executable, "-c", cmd], stderr=subprocess.STDOUT) + + assert not err.value.stderr, "nothing on the stderr" + + stdout_lines = err.value.output.decode("utf-8").strip().split("\n") + assert len(stdout_lines) == 2 + + log_output, trace_output = stdout_lines + + out_log_message = AirbyteMessage.parse_obj(json.loads(log_output)) + assert out_log_message == expected_log_message, "Log message should be emitted in expected form" + + out_trace_message = AirbyteMessage.parse_obj(json.loads(trace_output)) + assert out_trace_message.trace.emitted_at > 0 + out_trace_message.trace.emitted_at = 0.0 # set a specific emitted_at value for testing + assert out_trace_message == expected_trace_message, "Trace message should be emitted in expected form" diff --git a/airbyte-cdk/python/unit_tests/test_logger.py b/airbyte-cdk/python/unit_tests/test_logger.py index 96b5be2052089..e90184a3fcb1e 100644 --- a/airbyte-cdk/python/unit_tests/test_logger.py +++ b/airbyte-cdk/python/unit_tests/test_logger.py @@ -1,17 +1,13 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. # - import json import logging -import subprocess -import sys from typing import Dict import pytest from airbyte_cdk.logger import AirbyteLogFormatter -from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage @pytest.fixture(scope="session") @@ -53,20 +49,21 @@ def test_level_transform(logger, caplog): assert level_critical == "FATAL" -def test_trace(logger, caplog): - logger.log(logging.getLevelName("TRACE"), "Test trace 1") - record = caplog.records[0] - assert record.levelname == "TRACE" - assert record.message == "Test trace 1" - - def test_debug(logger, caplog): - logger.debug("Test debug 1") + # Test debug logger in isolation since the default logger is initialized to TRACE (15) instead of DEBUG (10).
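+ # (For reference: logging.DEBUG == 10 and logging.INFO == 20, so a default threshold of TRACE (15) filters out plain .debug() calls; the next test checks exactly that.)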
+ debug_logger = logging.getLogger("airbyte.Debuglogger") + debug_logger.setLevel(logging.DEBUG) + debug_logger.debug("Test debug 1") record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message == "Test debug 1" +def test_default_debug_is_ignored(logger, caplog): + logger.debug("Test debug that is ignored since log level is TRACE") + assert len(caplog.records) == 0 + + def test_info(logger, caplog): logger.info("Test info 1") logger.info("Test info 2") @@ -95,21 +92,3 @@ def test_fatal(logger, caplog): record = caplog.records[0] assert record.levelname == "CRITICAL" assert record.message == "Test fatal 1" - - -def test_unhandled_logger(): - cmd = "from airbyte_cdk.logger import init_logger; init_logger('airbyte'); raise 1" - expected_message = ( - "exceptions must derive from BaseException\n" - "Traceback (most recent call last):\n" - ' File "", line 1, in \n' - "TypeError: exceptions must derive from BaseException" - ) - log_message = AirbyteMessage(type="LOG", log=AirbyteLogMessage(level="FATAL", message=expected_message)) - expected_output = log_message.json(exclude_unset=True) - - with pytest.raises(subprocess.CalledProcessError) as err: - subprocess.check_output([sys.executable, "-c", cmd], stderr=subprocess.STDOUT) - - assert not err.value.stderr, "nothing on the stderr" - assert err.value.output.decode("utf-8").strip() == expected_output, "Error should be printed in expected form" diff --git a/airbyte-cdk/python/unit_tests/test_secure_logger.py b/airbyte-cdk/python/unit_tests/test_secure_logger.py index ed1e86cfc4dbb..d7e7d37940377 100644 --- a/airbyte-cdk/python/unit_tests/test_secure_logger.py +++ b/airbyte-cdk/python/unit_tests/test_secure_logger.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2021 Airbyte, Inc., all rights reserved. +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
# import logging @@ -145,7 +145,7 @@ def test_airbyte_secret_is_masked_on_logger_output(source_spec, mocker, config, assert all([str(v) in log_result for v in expected_plain_text_values]) -def test_airbyte_secrets_are_masked_on_uncaught_exceptions(mocker, caplog): +def test_airbyte_secrets_are_masked_on_uncaught_exceptions(mocker, caplog, capsys): caplog.set_level(logging.DEBUG, logger="airbyte.test") caplog.handler.setFormatter(AirbyteLogFormatter()) @@ -188,10 +188,11 @@ def read( list(entrypoint.run(parsed_args)) except Exception: sys.excepthook(*sys.exc_info()) - assert I_AM_A_SECRET_VALUE not in caplog.text, "Should have filtered secret value from exception" + assert I_AM_A_SECRET_VALUE not in capsys.readouterr().out, "Should have filtered secret value from exception trace message" + assert I_AM_A_SECRET_VALUE not in caplog.text, "Should have filtered secret value from exception log message" -def test_non_airbyte_secrets_are_not_masked_on_uncaught_exceptions(mocker, caplog): +def test_non_airbyte_secrets_are_not_masked_on_uncaught_exceptions(mocker, caplog, capsys): caplog.set_level(logging.DEBUG, logger="airbyte.test") caplog.handler.setFormatter(AirbyteLogFormatter()) @@ -235,4 +236,5 @@ def read( list(entrypoint.run(parsed_args)) except Exception: sys.excepthook(*sys.exc_info()) - assert NOT_A_SECRET_VALUE in caplog.text, "Should not have filtered non-secret value from exception" + assert NOT_A_SECRET_VALUE in capsys.readouterr().out, "Should not have filtered non-secret value from exception trace message" + assert NOT_A_SECRET_VALUE in caplog.text, "Should not have filtered non-secret value from exception log message" diff --git a/airbyte-integrations/connectors/destination-databricks/lib/.keep b/airbyte-cdk/python/unit_tests/utils/__init__.py similarity index 100% rename from airbyte-integrations/connectors/destination-databricks/lib/.keep rename to airbyte-cdk/python/unit_tests/utils/__init__.py diff --git a/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py b/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py new file mode 100644 index 0000000000000..0694b2786da7b --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_secret_utils.py @@ -0,0 +1,118 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
+# + +import pytest +from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets, get_secret_paths, get_secrets, update_secrets + +SECRET_STRING_KEY = "secret_key1" +SECRET_STRING_VALUE = "secret_value" +SECRET_STRING_2_KEY = "secret_key2" +SECRET_STRING_2_VALUE = "second_secret_val" +SECRET_INT_KEY = "secret_int" +SECRET_INT_VALUE = 1337 +NOT_SECRET_KEY = "not_a_secret" +NOT_SECRET_VALUE = "unimportant value" + + +flat_spec_with_secret = {"properties": {SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}} +flat_config_with_secret = {SECRET_STRING_KEY: SECRET_STRING_VALUE, NOT_SECRET_KEY: NOT_SECRET_VALUE} + +flat_spec_with_secret_int = { + "properties": {SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}} +} +flat_config_with_secret_int = {SECRET_INT_KEY: SECRET_INT_VALUE, NOT_SECRET_KEY: NOT_SECRET_VALUE} + +flat_spec_without_secrets = {"properties": {NOT_SECRET_KEY: {"type": "string"}}} +flat_config_without_secrets = {NOT_SECRET_KEY: NOT_SECRET_VALUE} + +spec_with_oneof_secrets = { + "properties": { + SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + "credentials": { + "type": "object", + "oneOf": [ + { + "type": "object", + "properties": {SECRET_STRING_2_KEY: {"type": "string", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}, + }, + { + "type": "object", + "properties": {SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, NOT_SECRET_KEY: {"type": "string"}}, + }, + ], + }, + } +} +config_with_oneof_secrets_1 = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_STRING_2_KEY: SECRET_STRING_2_VALUE}, +} +config_with_oneof_secrets_2 = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_INT_KEY: SECRET_INT_VALUE}, +} + +spec_with_nested_secrets = { + "properties": { + SECRET_STRING_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + "credentials": { + "type": "object", + "properties": { + SECRET_STRING_2_KEY: {"type": "string", "airbyte_secret": True}, + NOT_SECRET_KEY: {"type": "string"}, + SECRET_INT_KEY: {"type": "integer", "airbyte_secret": True}, + }, + }, + } +} +config_with_nested_secrets = { + SECRET_STRING_KEY: SECRET_STRING_VALUE, + NOT_SECRET_KEY: NOT_SECRET_VALUE, + "credentials": {SECRET_STRING_2_KEY: SECRET_STRING_2_VALUE, SECRET_INT_KEY: SECRET_INT_VALUE}, +} + + +@pytest.mark.parametrize( + ["spec", "expected"], + [ + (flat_spec_with_secret, [[SECRET_STRING_KEY]]), + (flat_spec_without_secrets, []), + (flat_spec_with_secret_int, [[SECRET_INT_KEY]]), + (spec_with_oneof_secrets, [[SECRET_STRING_KEY], ["credentials", SECRET_STRING_2_KEY], ["credentials", SECRET_INT_KEY]]), + (spec_with_nested_secrets, [[SECRET_STRING_KEY], ["credentials", SECRET_STRING_2_KEY], ["credentials", SECRET_INT_KEY]]), + ], +) +def test_get_secret_paths(spec, expected): + assert get_secret_paths(spec) == expected, f"Expected {spec} to yield secret paths {expected}" + + +@pytest.mark.parametrize( + ["spec", "config", "expected"], + [ + (flat_spec_with_secret, flat_config_with_secret, [SECRET_STRING_VALUE]), + (flat_spec_without_secrets, flat_config_without_secrets, []), + (flat_spec_with_secret_int, flat_config_with_secret_int, [SECRET_INT_VALUE]), + (spec_with_oneof_secrets, config_with_oneof_secrets_1, [SECRET_STRING_VALUE, SECRET_STRING_2_VALUE]), + (spec_with_oneof_secrets, 
config_with_oneof_secrets_2, [SECRET_STRING_VALUE, SECRET_INT_VALUE]), + (spec_with_nested_secrets, config_with_nested_secrets, [SECRET_STRING_VALUE, SECRET_STRING_2_VALUE, SECRET_INT_VALUE]), + ], +) +def test_get_secrets(spec, config, expected): + assert get_secrets(spec, config) == expected, f"Expected the spec {spec} and config {config} to produce {expected}" + + +def test_secret_filtering(): + sensitive_str = f"{SECRET_STRING_VALUE} {NOT_SECRET_VALUE} {SECRET_STRING_VALUE} {SECRET_STRING_2_VALUE}" + + update_secrets([]) + filtered = filter_secrets(sensitive_str) + assert filtered == sensitive_str + + update_secrets([SECRET_STRING_VALUE, SECRET_STRING_2_VALUE]) + filtered = filter_secrets(sensitive_str) + assert filtered == f"**** {NOT_SECRET_VALUE} **** ****" diff --git a/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py b/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py new file mode 100644 index 0000000000000..c95d0bf48dcff --- /dev/null +++ b/airbyte-cdk/python/unit_tests/utils/test_traced_exception.py @@ -0,0 +1,83 @@ +# +# Copyright (c) 2022 Airbyte, Inc., all rights reserved. +# + +import json + +import pytest +from airbyte_cdk.models.airbyte_protocol import AirbyteErrorTraceMessage, AirbyteMessage, AirbyteTraceMessage, FailureType, TraceType +from airbyte_cdk.models.airbyte_protocol import Type as MessageType +from airbyte_cdk.utils.traced_exception import AirbyteTracedException + + +@pytest.fixture +def raised_exception(): + try: + raise RuntimeError("an error has occurred") + except RuntimeError as e: + return e + + +def test_build_from_existing_exception(raised_exception): + traced_exc = AirbyteTracedException.from_exception(raised_exception, message="my user-friendly message") + assert traced_exc.message == "my user-friendly message" + assert traced_exc.internal_message == "an error has occurred" + assert traced_exc.failure_type == FailureType.system_error + assert traced_exc._exception == raised_exception + + +def test_exception_as_airbyte_message(): + traced_exc = AirbyteTracedException("an internal message") + airbyte_message = traced_exc.as_airbyte_message() + + assert type(airbyte_message) == AirbyteMessage + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.ERROR + assert airbyte_message.trace.emitted_at > 0 + assert airbyte_message.trace.error.failure_type == FailureType.system_error + assert airbyte_message.trace.error.message == "Something went wrong in the connector. See the logs for more details." + assert airbyte_message.trace.error.internal_message == "an internal message" + assert airbyte_message.trace.error.stack_trace == "airbyte_cdk.utils.traced_exception.AirbyteTracedException: an internal message\n" + + +def test_existing_exception_as_airbyte_message(raised_exception): + traced_exc = AirbyteTracedException.from_exception(raised_exception) + airbyte_message = traced_exc.as_airbyte_message() + + assert type(airbyte_message) == AirbyteMessage + assert airbyte_message.type == MessageType.TRACE + assert airbyte_message.trace.type == TraceType.ERROR + assert airbyte_message.trace.error.message == "Something went wrong in the connector. See the logs for more details." 
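+ # from_exception() above received no explicit message, so the generic user-facing text is expected for error.message.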
+ assert airbyte_message.trace.error.internal_message == "an error has occurred" + assert airbyte_message.trace.error.stack_trace.startswith("Traceback (most recent call last):") + assert airbyte_message.trace.error.stack_trace.endswith( + 'raise RuntimeError("an error has occurred")\n' "RuntimeError: an error has occurred\n" + ) + + +def test_emit_message(capsys): + traced_exc = AirbyteTracedException( + internal_message="internal message", message="user-friendly message", exception=RuntimeError("oh no") + ) + + expected_message = AirbyteMessage( + type="TRACE", + trace=AirbyteTraceMessage( + type="ERROR", + emitted_at=0.0, + error=AirbyteErrorTraceMessage( + failure_type="system_error", + message="user-friendly message", + internal_message="internal message", + stack_trace="RuntimeError: oh no\n", + ), + ), + ) + + traced_exc.emit_message() + + stdout = capsys.readouterr().out + printed_message = AirbyteMessage.parse_obj(json.loads(stdout)) + printed_message.trace.emitted_at = 0.0 + + assert printed_message == expected_message diff --git a/airbyte-cli/build.gradle b/airbyte-cli/build.gradle index 4cccd9d4f4018..12f7e2d94a26d 100644 --- a/airbyte-cli/build.gradle +++ b/airbyte-cli/build.gradle @@ -1,3 +1,3 @@ -Task dockerBuildTask = getDockerBuildTask("cli", "$project.projectDir") +Task dockerBuildTask = getDockerBuildTask("cli", "$project.projectDir", "$rootProject.ext.version", "$rootProject.ext.image_tag") dockerBuildTask.dependsOn(copyDocker) assemble.dependsOn(dockerBuildTask) diff --git a/airbyte-cli/readme.md b/airbyte-cli/readme.md new file mode 100644 index 0000000000000..c65791d7c4407 --- /dev/null +++ b/airbyte-cli/readme.md @@ -0,0 +1,3 @@ +# airbyte-cli + +Thin CLI over the Airbyte Configuration API to make it easier to interact with the API from the command line. diff --git a/airbyte-commons-cli/build.gradle b/airbyte-commons-cli/build.gradle index 3dbb175d2ad9a..2b9e141d8164c 100644 --- a/airbyte-commons-cli/build.gradle +++ b/airbyte-commons-cli/build.gradle @@ -5,3 +5,5 @@ plugins { dependencies { implementation 'commons-cli:commons-cli:1.4' } + +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-commons-cli/readme.md b/airbyte-commons-cli/readme.md index bd4bd8aab272f..81aa7feb0b334 100644 --- a/airbyte-commons-cli/readme.md +++ b/airbyte-commons-cli/readme.md @@ -1 +1,3 @@ +# airbyte-commons-cli + This module houses utility functions for the `commons-cli` library. It is separate from `commons`, because it depends on external library `commons-cli` which we do not want to introduce as a dependency to every module. diff --git a/airbyte-commons-cli/src/main/java/io/airbyte/commons/cli/Clis.java b/airbyte-commons-cli/src/main/java/io/airbyte/commons/cli/Clis.java index 1833d5ed52185..623ef16631ebe 100644 --- a/airbyte-commons-cli/src/main/java/io/airbyte/commons/cli/Clis.java +++ b/airbyte-commons-cli/src/main/java/io/airbyte/commons/cli/Clis.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.commons.cli; diff --git a/airbyte-commons-cli/src/test/java/io/airbyte/commons/cli/ClisTest.java b/airbyte-commons-cli/src/test/java/io/airbyte/commons/cli/ClisTest.java index a2587c9ca18a7..f2a09b3a23793 100644 --- a/airbyte-commons-cli/src/test/java/io/airbyte/commons/cli/ClisTest.java +++ b/airbyte-commons-cli/src/test/java/io/airbyte/commons/cli/ClisTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.cli; @@ -16,10 +16,13 @@ class ClisTest { + private static final String ALPHA = "alpha"; + private static final String BETA = "beta"; + @Test void testCreateOptionGroup() { - final Option optionA = new Option("a", "alpha"); - final Option optionB = new Option("b", "beta"); + final Option optionA = new Option("a", ALPHA); + final Option optionB = new Option("b", BETA); final OptionGroup optionGroupExpected = new OptionGroup(); optionGroupExpected.addOption(optionA); optionGroupExpected.addOption(optionB); @@ -38,10 +41,10 @@ void testParse() { final Option optionA = Option.builder("a").required(true).hasArg(true).build(); final Option optionB = Option.builder("b").required(true).hasArg(true).build(); final Options options = new Options().addOption(optionA).addOption(optionB); - final String[] args = {"-a", "alpha", "-b", "beta"}; + final String[] args = {"-a", ALPHA, "-b", BETA}; final CommandLine parsed = Clis.parse(args, options, new DefaultParser()); - assertEquals("alpha", parsed.getOptions()[0].getValue()); - assertEquals("beta", parsed.getOptions()[1].getValue()); + assertEquals(ALPHA, parsed.getOptions()[0].getValue()); + assertEquals(BETA, parsed.getOptions()[1].getValue()); } @Test @@ -49,7 +52,7 @@ void testParseNonConforming() { final Option optionA = Option.builder("a").required(true).hasArg(true).build(); final Option optionB = Option.builder("b").required(true).hasArg(true).build(); final Options options = new Options().addOption(optionA).addOption(optionB); - final String[] args = {"-a", "alpha", "-b", "beta", "-c", "charlie"}; + final String[] args = {"-a", ALPHA, "-b", BETA, "-c", "charlie"}; assertThrows(IllegalArgumentException.class, () -> Clis.parse(args, options, new DefaultParser())); } @@ -58,7 +61,7 @@ void testParseNonConformingWithSyntax() { final Option optionA = Option.builder("a").required(true).hasArg(true).build(); final Option optionB = Option.builder("b").required(true).hasArg(true).build(); final Options options = new Options().addOption(optionA).addOption(optionB); - final String[] args = {"-a", "alpha", "-b", "beta", "-c", "charlie"}; + final String[] args = {"-a", ALPHA, "-b", BETA, "-c", "charlie"}; assertThrows(IllegalArgumentException.class, () -> Clis.parse(args, options, new DefaultParser(), "search")); } @@ -67,10 +70,10 @@ void testRelaxedParser() { final Option optionA = Option.builder("a").required(true).hasArg(true).build(); final Option optionB = Option.builder("b").required(true).hasArg(true).build(); final Options options = new Options().addOption(optionA).addOption(optionB); - final String[] args = {"-a", "alpha", "-b", "beta", "-c", "charlie"}; + final String[] args = {"-a", ALPHA, "-b", BETA, "-c", "charlie"}; final CommandLine parsed = Clis.parse(args, options, Clis.getRelaxedParser()); - assertEquals("alpha", parsed.getOptions()[0].getValue()); - assertEquals("beta", parsed.getOptions()[1].getValue()); + assertEquals(ALPHA, parsed.getOptions()[0].getValue()); + assertEquals(BETA, 
parsed.getOptions()[1].getValue()); } } diff --git a/airbyte-commons-docker/build.gradle b/airbyte-commons-docker/build.gradle index 4c2ec5099bfc2..7fc9a0b2df945 100644 --- a/airbyte-commons-docker/build.gradle +++ b/airbyte-commons-docker/build.gradle @@ -9,3 +9,5 @@ dependencies { testImplementation 'org.apache.commons:commons-lang3:3.11' } + +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-commons-docker/readme.md b/airbyte-commons-docker/readme.md new file mode 100644 index 0000000000000..46ed14b70f142 --- /dev/null +++ b/airbyte-commons-docker/readme.md @@ -0,0 +1,3 @@ +# airbyte-commons-docker + +This module contains common helpers for interacting with Docker and Docker images from Java. diff --git a/airbyte-commons-docker/src/main/java/io/airbyte/commons/docker/DockerUtils.java b/airbyte-commons-docker/src/main/java/io/airbyte/commons/docker/DockerUtils.java index 4d50e48f6195f..254e81f483d82 100644 --- a/airbyte-commons-docker/src/main/java/io/airbyte/commons/docker/DockerUtils.java +++ b/airbyte-commons-docker/src/main/java/io/airbyte/commons/docker/DockerUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.docker; diff --git a/airbyte-commons-docker/src/test/java/io/airbyte/commons/docker/DockerUtilsTest.java b/airbyte-commons-docker/src/test/java/io/airbyte/commons/docker/DockerUtilsTest.java index b6dcd7fdc2c4c..ebb013d85fcda 100644 --- a/airbyte-commons-docker/src/test/java/io/airbyte/commons/docker/DockerUtilsTest.java +++ b/airbyte-commons-docker/src/test/java/io/airbyte/commons/docker/DockerUtilsTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.docker; diff --git a/airbyte-commons/build.gradle b/airbyte-commons/build.gradle index 1c8e4129e8c26..3c4c0f9e19617 100644 --- a/airbyte-commons/build.gradle +++ b/airbyte-commons/build.gradle @@ -3,5 +3,10 @@ plugins { } dependencies { - // Dependencies for this module should be specified in the top-level build.gradle. + // Dependencies for this module should be specified in the top-level build.gradle. See readme for more explanation. + + // this dependency is an exception to the above rule because it is only used INTERNALLY to the commons library. + implementation 'com.jayway.jsonpath:json-path:2.7.0' } + +Task publishArtifactsTask = getPublishArtifactsTask("$rootProject.ext.version", project) diff --git a/airbyte-commons/readme.md b/airbyte-commons/readme.md new file mode 100644 index 0000000000000..acbd8542866a5 --- /dev/null +++ b/airbyte-commons/readme.md @@ -0,0 +1,7 @@ +# airbyte-commons + +Common java helpers. + +This submodule is inherited by all other java modules in the monorepo! It is therefore important that we do not add dependencies to it, as those dependencies will also be added to every java module. The only dependencies that this module uses are the ones declared in the `build.gradle` at the root of the Airbyte monorepo. In other words it only uses dependencies that are already shared across all modules. The `dependencies` section of the `build.gradle` of `airbyte-commons` should always be empty. + +For other common java code that needs to be shared across modules that requires additional dependencies, we follow this convention: `airbyte-commons-`. 
See for example `airbyte-commons-cli` and `airbyte-commons-docker`. diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/bytes/ByteUtils.java b/airbyte-commons/src/main/java/io/airbyte/commons/bytes/ByteUtils.java deleted file mode 100644 index 1ba95a5b735d0..0000000000000 --- a/airbyte-commons/src/main/java/io/airbyte/commons/bytes/ByteUtils.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. - */ - -package io.airbyte.commons.bytes; - -import java.nio.charset.StandardCharsets; - -public class ByteUtils { - - /** - * Encodes this String into a sequence of bytes using the given charset. UTF-8 is based on 8-bit - * code units. Each character is encoded as 1 to 4 bytes. The first 128 Unicode code points are - * encoded as 1 byte in UTF-8. - * - * @param s - string where charset length will be counted - * @return length of bytes for charset - */ - public static long getSizeInBytesForUTF8CharSet(String s) { - return s.getBytes(StandardCharsets.UTF_8).length; - } - -} diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/GracefulShutdownHandler.java b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/GracefulShutdownHandler.java index ad4adcaa09009..809ecf22d00df 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/GracefulShutdownHandler.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/GracefulShutdownHandler.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.concurrency; @@ -18,7 +18,7 @@ public class GracefulShutdownHandler extends Thread { public GracefulShutdownHandler(final Duration terminateWaitDuration, final ExecutorService... threadPools) { this.terminateWaitDuration = terminateWaitDuration; - this.threadPools = threadPools; + this.threadPools = threadPools.clone(); } @Override diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/LifecycledCallable.java b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/LifecycledCallable.java index 5fd9459ea25c8..e556cfa99f992 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/LifecycledCallable.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/LifecycledCallable.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.concurrency; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/VoidCallable.java b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/VoidCallable.java index ac502ab5e2914..c61802a70d0ba 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/VoidCallable.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/VoidCallable.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.commons.concurrency; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/WaitingUtils.java b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/WaitingUtils.java index f12e309a799bb..6ee93240616d7 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/WaitingUtils.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/concurrency/WaitingUtils.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.concurrency; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/enums/Enums.java b/airbyte-commons/src/main/java/io/airbyte/commons/enums/Enums.java index ccb47d1d1f546..1a5c59670a728 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/enums/Enums.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/enums/Enums.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.enums; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/features/EnvVariableFeatureFlags.java b/airbyte-commons/src/main/java/io/airbyte/commons/features/EnvVariableFeatureFlags.java index b89ca268c9eba..9991fd35c5036 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/features/EnvVariableFeatureFlags.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/features/EnvVariableFeatureFlags.java @@ -1,14 +1,48 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.features; +import java.util.function.Function; +import lombok.extern.slf4j.Slf4j; + +@Slf4j public class EnvVariableFeatureFlags implements FeatureFlags { + public static final String USE_STREAM_CAPABLE_STATE = "USE_STREAM_CAPABLE_STATE"; + + @Override + public boolean autoDisablesFailingConnections() { + log.info("Auto Disable Failing Connections: " + Boolean.parseBoolean(System.getenv("AUTO_DISABLE_FAILING_CONNECTIONS"))); + + return Boolean.parseBoolean(System.getenv("AUTO_DISABLE_FAILING_CONNECTIONS")); + } + + @Override + public boolean exposeSecretsInExport() { + return Boolean.parseBoolean(System.getenv("EXPOSE_SECRETS_IN_EXPORT")); + } + @Override - public boolean usesNewScheduler() { - return Boolean.parseBoolean(System.getenv("NEW_SCHEDULER")); + public boolean forceSecretMigration() { + return Boolean.parseBoolean(System.getenv("FORCE_MIGRATE_SECRET_STORE")); + } + + @Override + public boolean useStreamCapableState() { + return getEnvOrDefault(USE_STREAM_CAPABLE_STATE, false, Boolean::parseBoolean); + } + + // TODO: refactor in order to use the same methods as the ones in EnvConfigs.java + public <T> T getEnvOrDefault(final String key, final T defaultValue, final Function<String, T> parser) { + final String value = System.getenv(key); + if (value != null && !value.isEmpty()) { + return parser.apply(value); + } else { + log.info("Using default value for environment variable {}: '{}'", key, defaultValue); + return defaultValue; + } } } diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/features/FeatureFlags.java b/airbyte-commons/src/main/java/io/airbyte/commons/features/FeatureFlags.java index 02a5974c12aed..1053e19035988 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/features/FeatureFlags.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/features/FeatureFlags.java @@ -1,5 +1,5 @@ /* - *
Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.features; @@ -10,6 +10,12 @@ */ public interface FeatureFlags { - boolean usesNewScheduler(); + boolean autoDisablesFailingConnections(); + + boolean exposeSecretsInExport(); + + boolean forceSecretMigration(); + + boolean useStreamCapableState(); } diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiConsumer.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiConsumer.java index 1209364c32413..8ce45f530abc3 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiConsumer.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiConsumer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiFunction.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiFunction.java index 68fa308428f52..f2d00d1d699df 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiFunction.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedBiFunction.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedConsumer.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedConsumer.java index 0853e8d32dc3f..89133463ae9d4 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedConsumer.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedConsumer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedFunction.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedFunction.java index c06232d1eac83..cd959e5b15593 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedFunction.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedFunction.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedSupplier.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedSupplier.java index a09e64cc81719..5fd62db014abc 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedSupplier.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/CheckedSupplier.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
*/ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/Consumers.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/Consumers.java index 15c960c0a128a..9738a4cd71303 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/Consumers.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/Consumers.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/functional/ListConsumer.java b/airbyte-commons/src/main/java/io/airbyte/commons/functional/ListConsumer.java index 5aa0d118571b3..e63fb5f69ebb9 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/functional/ListConsumer.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/functional/ListConsumer.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.functional; diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/io/Archives.java b/airbyte-commons/src/main/java/io/airbyte/commons/io/Archives.java index 034944b945797..0e81c1d6adf33 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/io/Archives.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/io/Archives.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. */ package io.airbyte.commons.io; @@ -18,13 +18,9 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class Archives { - private static final Logger LOGGER = LoggerFactory.getLogger(Archives.class); - /** * Compress a @param sourceFolder into a Gzip Tarball @param archiveFile */ @@ -53,8 +49,8 @@ private static void compressFile(final Path file, final Path filename, final Tar public static void extractArchive(final Path archiveFile, final Path destinationFolder) throws IOException { final TarArchiveInputStream archive = new TarArchiveInputStream(new GzipCompressorInputStream(new BufferedInputStream(Files.newInputStream(archiveFile)))); - ArchiveEntry entry; - while ((entry = archive.getNextEntry()) != null) { + ArchiveEntry entry = archive.getNextEntry(); + while (entry != null) { final Path newPath = zipSlipProtect(entry, destinationFolder); if (entry.isDirectory()) { Files.createDirectories(newPath); @@ -67,6 +63,7 @@ public static void extractArchive(final Path archiveFile, final Path destination } Files.copy(archive, newPath, StandardCopyOption.REPLACE_EXISTING); } + entry = archive.getNextEntry(); } } diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/io/FileTtlManager.java b/airbyte-commons/src/main/java/io/airbyte/commons/io/FileTtlManager.java index 5157688f314e1..fc162bb1da567 100644 --- a/airbyte-commons/src/main/java/io/airbyte/commons/io/FileTtlManager.java +++ b/airbyte-commons/src/main/java/io/airbyte/commons/io/FileTtlManager.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Airbyte, Inc., all rights reserved. + * Copyright (c) 2022 Airbyte, Inc., all rights reserved. 
 */

package io.airbyte.commons.io;
diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/io/IOs.java b/airbyte-commons/src/main/java/io/airbyte/commons/io/IOs.java
index c3dce27228a48..945bfbf563660 100644
--- a/airbyte-commons/src/main/java/io/airbyte/commons/io/IOs.java
+++ b/airbyte-commons/src/main/java/io/airbyte/commons/io/IOs.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+ * Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 */

package io.airbyte.commons.io;
@@ -88,9 +88,10 @@ public static List getTail(final int numLines, final Path path) throws I
    try (final ReversedLinesFileReader fileReader = new ReversedLinesFileReader(file, Charsets.UTF_8)) {
      final List<String> lines = new ArrayList<>();

-      String line;
-      while ((line = fileReader.readLine()) != null && lines.size() < numLines) {
+      String line = fileReader.readLine();
+      while (line != null && lines.size() < numLines) {
        lines.add(line);
+        line = fileReader.readLine();
      }

      Collections.reverse(lines);
diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/io/LineGobbler.java b/airbyte-commons/src/main/java/io/airbyte/commons/io/LineGobbler.java
index 0918ece248910..4f02a654b24ed 100644
--- a/airbyte-commons/src/main/java/io/airbyte/commons/io/LineGobbler.java
+++ b/airbyte-commons/src/main/java/io/airbyte/commons/io/LineGobbler.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+ * Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 */

package io.airbyte.commons.io;
@@ -20,13 +20,14 @@
public class LineGobbler implements VoidCallable {

  private final static Logger LOGGER = LoggerFactory.getLogger(LineGobbler.class);
+  private final static String GENERIC = "generic";

  public static void gobble(final InputStream is, final Consumer<String> consumer) {
-    gobble(is, consumer, "generic", MdcScope.DEFAULT_BUILDER);
+    gobble(is, consumer, GENERIC, MdcScope.DEFAULT_BUILDER);
  }

  public static void gobble(final InputStream is, final Consumer<String> consumer, final MdcScope.Builder mdcScopeBuilder) {
-    gobble(is, consumer, "generic", mdcScopeBuilder);
+    gobble(is, consumer, GENERIC, mdcScopeBuilder);
  }

  public static void gobble(final InputStream is, final Consumer<String> consumer, final String caller, final MdcScope.Builder mdcScopeBuilder) {
@@ -47,7 +48,7 @@ public static void gobble(final InputStream is, final Consumer consumer,
              final Consumer<String> consumer,
              final ExecutorService executor,
              final Map<String, String> mdc) {
-    this(is, consumer, executor, mdc, "generic", MdcScope.DEFAULT_BUILDER);
+    this(is, consumer, executor, mdc, GENERIC, MdcScope.DEFAULT_BUILDER);
  }

  LineGobbler(final InputStream is,
@@ -55,7 +56,7 @@ public static void gobble(final InputStream is, final Consumer consumer,
              final ExecutorService executor,
              final Map<String, String> mdc,
              final MdcScope.Builder mdcScopeBuilder) {
-    this(is, consumer, executor, mdc, "generic", mdcScopeBuilder);
+    this(is, consumer, executor, mdc, GENERIC, mdcScopeBuilder);
  }

  LineGobbler(final InputStream is,
@@ -76,11 +77,12 @@ public static void gobble(final InputStream is, final Consumer consumer,
  public void voidCall() {
    MDC.setContextMap(mdc);
    try {
-      String line;
-      while ((line = is.readLine()) != null) {
+      String line = is.readLine();
+      while (line != null) {
        try (final var mdcScope = containerLogMdcBuilder.build()) {
          consumer.accept(line);
        }
+        line = is.readLine();
      }
    } catch (final IOException i) {
      LOGGER.warn("{} gobbler IOException: {}. Typically happens when cancelling a job.", caller, i.getMessage());
diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/jackson/MoreMappers.java b/airbyte-commons/src/main/java/io/airbyte/commons/jackson/MoreMappers.java
index ceaf996478001..908e958056aae 100644
--- a/airbyte-commons/src/main/java/io/airbyte/commons/jackson/MoreMappers.java
+++ b/airbyte-commons/src/main/java/io/airbyte/commons/jackson/MoreMappers.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+ * Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 */

package io.airbyte.commons.jackson;
diff --git a/airbyte-commons/src/main/java/io/airbyte/commons/json/JsonPaths.java b/airbyte-commons/src/main/java/io/airbyte/commons/json/JsonPaths.java
new file mode 100644
index 0000000000000..4571efb4ab3d1
--- /dev/null
+++ b/airbyte-commons/src/main/java/io/airbyte/commons/json/JsonPaths.java
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2022 Airbyte, Inc., all rights reserved.
+ */
+
+package io.airbyte.commons.json;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.google.api.client.util.Preconditions;
+import com.jayway.jsonpath.Configuration;
+import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.Option;
+import com.jayway.jsonpath.PathNotFoundException;
+import com.jayway.jsonpath.spi.json.JacksonJsonNodeJsonProvider;
+import com.jayway.jsonpath.spi.json.JsonProvider;
+import com.jayway.jsonpath.spi.mapper.JacksonMappingProvider;
+import com.jayway.jsonpath.spi.mapper.MappingProvider;
+import io.airbyte.commons.json.JsonSchemas.FieldNameOrList;
+import io.airbyte.commons.util.MoreIterators;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Optional;
+import java.util.Set;
+import java.util.function.BiFunction;
+import java.util.stream.Collectors;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * JSONPath is a specification for querying JSON objects. More information about the specification can
+ * be found here: https://goessner.net/articles/JsonPath/. For those familiar with jq, JSONPath will
+ * be most recognizable as "that DSL that jq uses".
+ *
+ * We use a Java implementation of this specification (repo: https://github.com/json-path/JsonPath).
+ * This class wraps that implementation to make it easier to leverage this tool internally.
+ *
+ * GOTCHA: Keep in mind that with JSONPath, depending on the query, 0, 1, or N values may be returned.
+ * The pattern for handling return values is very much like writing SQL queries. When using it, you
+ * must consider what the number of return values for your query might be. E.g. for this object: {
+ * "alpha": [1, 2, 3] }, the JSONPath "$.alpha[*]" would return: [1, 2, 3], but this one
+ * "$.alpha[0]" would return: [1]. The Java interface we place over this query system defaults to
+ * returning a list for query results. In addition, we provide helper functions that will just
+ * return a single value (see: {@link JsonPaths#getSingleValue(JsonNode, String)}). These should
+ * only be used if it is not possible for a query to return more than one value.
+ */
+public class JsonPaths {
+
+  private static final Logger LOGGER = LoggerFactory.getLogger(JsonPaths.class);
+
+  static final String JSON_PATH_START_CHARACTER = "$";
+  static final String JSON_PATH_LIST_SPLAT = "[*]";
+  static final String JSON_PATH_FIELD_SEPARATOR = ".";
+
+  // set default configurations at startup to match our JSON setup.
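As a concrete illustration of the query semantics the JsonPaths Javadoc above describes, here is a minimal, self-contained sketch against the underlying Jayway JsonPath API. This is illustrative only and not part of the patch; the class name JsonPathQueryExample is invented for the example, and it runs against Jayway's default provider rather than the Jackson provider the static block below configures.

import com.jayway.jsonpath.JsonPath;
import java.util.List;

public class JsonPathQueryExample {

  public static void main(final String[] args) {
    final String json = "{ \"alpha\": [1, 2, 3] }";

    // An indefinite path (one containing a splat) can match many values, so the result is a list: [1, 2, 3].
    final List<Integer> all = JsonPath.read(json, "$.alpha[*]");

    // A definite path matches at most one value; the raw Jayway API returns it directly (1), whereas the
    // JsonPaths wrapper described above normalizes query results to a list ([1]).
    final Integer first = JsonPath.read(json, "$.alpha[0]");

    System.out.println(all + " / " + first);
  }
}

Once the Jackson provider configured in the static block below is installed as the default, equivalent queries should yield Jackson JsonNode results rather than the default provider's types.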
+  static {
+    Configuration.setDefaults(new Configuration.Defaults() {
+
+      // allows us to pass in Jackson JsonNode
+      private static final JsonProvider jsonProvider = new JacksonJsonNodeJsonProvider();
+      private static final MappingProvider mappingProvider = new JacksonMappingProvider();
+
+      @Override
+      public JsonProvider jsonProvider() {
+        return jsonProvider;
+      }
+
+      @Override
+      public MappingProvider mappingProvider() {
+        return mappingProvider;
+      }
+
+      @Override
+      public Set