First, load the {salesforcer} and {dplyr} packages and log in, if needed.
library(dplyr, warn.conflicts = FALSE)
library(salesforcer)
sf_auth()
For really large inserts, updates, deletes, upserts, and queries you can just add api_type = "Bulk 1.0"
to most functions to get the benefits of using the Bulk API instead of the SOAP or REST APIs. The change you have to make to switch from the REST API to the Bulk 1.0 API is as simple as adding api_type="Bulk 1.0"
to your function arguments. First, let’s build a tbl_df
with four new records to be created (two for each API).
# Build a small tibble of test Contact records to insert.
# A randomly drawn prefix is embedded in every external Id value
# (presumably so repeated runs do not collide -- confirm against the org setup).
n <- 4
prefix <- paste0("Bulk-", as.integer(runif(1, 1, 100000)), "-")
new_contacts <- tibble(
  FirstName = rep("Test", n),
  LastName = paste0("Contact-Create-", seq_len(n)),
  My_External_Id__c = paste0(prefix, letters[seq_len(n)])
)
There are some differences in the way the REST API returns response information vs. the Bulk 1.0 API. However, the changes in Salesforce are exactly the same for these two calls.
# REST: insert the first two rows through the REST API and print the result.
rest_created_records <- sf_create(
  new_contacts[1:2, ],
  object_name = "Contact",
  api_type = "REST"
)
rest_created_records
#> # A tibble: 2 × 2
#>   id                 success
#>   <chr>              <lgl>
#> 1 0033s00001BXfH1AAL TRUE
#> 2 0033s00001BXfH2AAL TRUE
# Bulk: insert the remaining two rows through the Bulk 1.0 API and print the
# result; note the different column names compared with the REST response.
bulk_created_records <- sf_create(
  new_contacts[3:4, ],
  object_name = "Contact",
  api_type = "Bulk 1.0"
)
bulk_created_records
#> # A tibble: 2 × 4
#>   Id                 Success Created Error
#>   <chr>              <lgl>   <lgl>   <lgl>
#> 1 0033s00001BXfH6AAL TRUE    TRUE    NA
#> 2 0033s00001BXfH7AAL TRUE    TRUE    NA
To show a lengthier example of using the Bulk 1.0 API, below is a workflow that creates 2 records, queries them, and deletes them. This is just an example. Typically, you’d want to use the Bulk APIs over the REST or SOAP APIs when dealing with over 10,000 records.
# create bulk: insert the contacts with the Bulk 1.0 API and print the result.
# `object` is reused by the query and delete steps of this workflow.
object <- "Contact"
created_records <- sf_create(
  new_contacts,
  object_name = object,
  api_type = "Bulk 1.0"
)
created_records
#> # A tibble: 2 × 4
#>   Id                 Success Created Error
#>   <chr>              <lgl>   <lgl>   <lgl>
#> 1 0033s00001BXfHBAA1 TRUE    TRUE    NA
#> 2 0033s00001BXfHCAA1 TRUE    TRUE    NA
# query bulk: fetch the records that were just created.
# The Ids are collapsed into a quoted, comma-separated list for the IN clause:
# sprintf() supplies the outer quotes and `collapse` the inner "','" separators.
my_soql <- sprintf(
  "SELECT Id,
          FirstName,
          LastName
   FROM Contact
   WHERE Id in ('%s')",
  paste0(created_records$Id, collapse = "','")
)
queried_records <- sf_query(
  my_soql,
  object_name = object,
  api_type = "Bulk 1.0"
)
queried_records
#> # A tibble: 2 × 3
#>   Id                 FirstName LastName
#>   <chr>              <chr>     <chr>
#> 1 0033s00001BXfHBAA1 Test      Contact-Create-1
#> 2 0033s00001BXfHCAA1 Test      Contact-Create-2
# delete bulk: remove the queried records by Id and print the result.
deleted_records <- sf_delete(
  queried_records$Id,
  object_name = object,
  api_type = "Bulk 1.0"
)
deleted_records
#> # A tibble: 2 × 4
#>   Id                 Success Created Error
#>   <chr>              <lgl>   <lgl>   <lgl>
#> 1 0033s00001BXfHBAA1 TRUE    FALSE   NA
#> 2 0033s00001BXfHCAA1 TRUE    FALSE   NA
One limitation of Bulk queries is that they do not support the following operations or structures of SOQL:
Salesforce has more recently introduced the Bulk 2.0 API, which is supposed to be faster and have a more consistent JSON/REST based API than the Bulk 1.0 API. In some cases I have noticed that the ordering of the result records will differ from the order of the input data because the data is batched and processed asynchronously by Salesforce instead of R. However, the Bulk 2.0 API returns every single field that was included in the call, so if you have an identifying key in your dataset, then it should not be a problem to join on that key with your original data to bring in the newly assigned Salesforce Id that is generated when the record is created in Salesforce. That said, I find it wasteful to transfer all of the field information back after the query, and I have not found a significant performance improvement between the Bulk 1.0 and Bulk 2.0 APIs. Finally, note that the Bulk 1.0 status field names (“Success”, “Created”, “Error”) are different from those returned by the Bulk 2.0 API (“sf__Created”, “sf__Error”).
# Build 20 test Contact records; the first ten are sent through Bulk 1.0 and
# the last ten through Bulk 2.0 in the calls that follow.
# A randomly drawn prefix is embedded in every external Id value.
n <- 20
prefix <- paste0("Bulk-", as.integer(runif(1, 1, 100000)), "-")
new_contacts <- tibble(
  FirstName = rep("Test", n),
  LastName = paste0("Contact-Create-", seq_len(n)),
  test_number__c = seq_len(n),
  My_External_Id__c = paste0(prefix, letters[seq_len(n)])
)
# Bulk 1.0: insert rows 1-10 and print the status-only result.
created_records_v1 <- sf_create(
  new_contacts[1:10, ],
  object_name = "Contact",
  api_type = "Bulk 1.0"
)
created_records_v1
#> # A tibble: 10 × 4
#>   Id                 Success Created Error
#>   <chr>              <lgl>   <lgl>   <lgl>
#> 1 0033s00001BXefxAAD TRUE    TRUE    NA
#> 2 0033s00001BXefyAAD TRUE    TRUE    NA
#> 3 0033s00001BXefzAAD TRUE    TRUE    NA
#> 4 0033s00001BXeg0AAD TRUE    TRUE    NA
#> 5 0033s00001BXfJ9AAL TRUE    TRUE    NA
#> # … with 5 more rows
# Bulk 2.0: insert rows 11-20; the result echoes the submitted fields alongside
# the sf__-prefixed status columns.
created_records_v2 <- sf_create(
  new_contacts[11:20, ],
  object_name = "Contact",
  api_type = "Bulk 2.0"
)
created_records_v2
#> # A tibble: 10 × 7
#>   sf__Id             sf__Created sf__Error FirstName LastName   My_External_Id_…
#>   <chr>              <lgl>       <lgl>     <chr>     <chr>      <chr>
#> 1 0033s00001BXe6cAAD TRUE        NA        Test      Contact-C… Bulk-89152-k
#> 2 0033s00001BXe6dAAD TRUE        NA        Test      Contact-C… Bulk-89152-l
#> 3 0033s00001BXe6eAAD TRUE        NA        Test      Contact-C… Bulk-89152-m
#> 4 0033s00001BXfJTAA1 TRUE        NA        Test      Contact-C… Bulk-89152-n
#> 5 0033s00001BXfJUAA1 TRUE        NA        Test      Contact-C… Bulk-89152-o
#> # … with 5 more rows, and 1 more variable: test_number__c <dbl>
Below is a simple performance benchmark between the Bulk 1.0 and Bulk 2.0 APIs for a small query. In general, the Bulk 2.0 should be faster. One potential reason for the implementation in R to be faster is that the entire recordset is parsed at once from a downloaded CSV of the results when using the Bulk 1.0 API. The Bulk 2.0 retrieves the same data in large batches (typically 50,000 records at a time). I would encourage users to experiment to see what works best in their Salesforce Org.
# Query shared by both benchmark candidates.
soql <- "SELECT Id, Name FROM Contact"

# Zero-argument wrappers so each API variant can be timed as a single call.
bulk1_query <- function() {
  sf_query(soql, "Contact", api_type = "Bulk 1.0")
}

# Bulk 2.0 doesn't need object name
bulk2_query <- function() {
  sf_query(soql, api_type = "Bulk 2.0")
}
# Time each wrapper 8 times and report the summary in seconds.
res <- microbenchmark::microbenchmark(
  bulk1_query(),
  bulk2_query(),
  times = 8,
  unit = "s"
)
res
#> Unit: seconds
#>           expr      min       lq     mean   median       uq       max neval
#>  bulk1_query() 8.437328 8.636501 8.927289 8.822989 9.201657  9.658687     8
#>  bulk2_query() 6.821483 6.914214 7.785570 7.019215 8.636023 10.324171     8
# Plot the benchmark timings; warnings and messages emitted while building the
# plot are silenced.
suppressWarnings(suppressMessages(
  ggplot2::autoplot(res) +
    ggplot2::scale_y_continuous(name = "Time [seconds]", n.breaks = 6)
))