For instance, we have two datasources : workbook1
and workbook4
, with different schemas, but some common columns
[root@ip-172-31-2-115 tmp]# curl -X POST http://ip-172-31-10-21.us-west-2.compute.internal:8082/druid/v2/?pretty -H 'Content-Type:application/json' -d @query_workbook1.json
[ {
"segmentId" : "Workbook1_2018-01-01T00:00:00.000Z_2018-02-01T00:00:00.000Z_2018-06-25T20:57:45.505Z",
"columns" : [ "__time", "card number", "email id", "name" ],
"events" : [ [ 1514764800000, "123456", "jason@abc.com", "jason b" ], [ 1514851200000, "234567", "al@bbc.com", "al c" ], [ 1514937600000, "345678", "brian@ccd.com", "brian K" ], [ 1515024000000, "456789", "jason@abc.com", "jason b" ], [ 1515110400000, "567890", "henry@dns.com", "henry G" ] ]
[root@ip-172-31-2-115 tmp]# curl -X POST http://ip-172-31-10-21.us-west-2.compute.internal:8082/druid/v2/?pretty -H 'Content-Type:application/json' -d @query_workbook4.json
[ {
"segmentId" : "Workbook4_2018-01-01T00:00:00.000Z_2018-02-01T00:00:00.000Z_2018-06-25T21:36:28.017Z",
"columns" : [ "__time", "card number", "email id", "address", "zip" ],
"events" : [ [ 1515628800000, "654321", "jason@abc.com", "123 any street", "95434" ], [ 1515715200000, "765432", "al@bbc.com", "345 unknown ave", "78643" ], [ 1515801600000, "876543", "tom@llc.com", "653 where blvd", "88976" ], [ 1515888000000, "987654", "jason@abc.com", "1 goto block", "22345" ], [ 1515974400000, "987651", "mike@fgb.com", "889 mapper ave", "12001" ] ]
} ]
Then I can create a UNON query, to find card number
for email id = jason@abc.com
, from both datasources:
[root@ip-172-31-2-115 tmp]# cat query_workbook_union2.json
{
"queryType": "scan",
"dataSource": {
"type": "union",
"dataSources": ["Workbook1", "Workbook4"]
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"virtualColumns": [],
"resultFormat": "compactedList",
"batchSize": 20480,
"limit": 5000,
"filter": {
"type": "selector",
"dimension": "email id",
"value": "jason@abc.com"
},
"columns": [
"__time",
"card number",
"email id"
],
"legacy": false,
"context": {
"timeout": 300000
},
"descending": false,
"granularity": {
"type": "all"
}
}
and the output reads:
[root@ip-172-31-2-115 tmp]# curl -X POST http://ip-172-31-10-21.us-west-2.compute.internal:8082/druid/v2/?pretty -H 'Content-Type:application/json' -d @query_workbook_union2.json
[ {
"segmentId" : "Workbook1_2018-01-01T00:00:00.000Z_2018-02-01T00:00:00.000Z_2018-06-25T20:57:45.505Z",
"columns" : [ "__time", "card number", "email id" ],
"events" : [ [ 1514764800000, "123456", "jason@abc.com" ], [ 1515024000000, "456789", "jason@abc.com" ] ]
}, {
"segmentId" : "Workbook4_2018-01-01T00:00:00.000Z_2018-02-01T00:00:00.000Z_2018-06-25T21:36:28.017Z",
"columns" : [ "__time", "card number", "email id" ],
"events" : [ [ 1515628800000, "654321", "jason@abc.com" ], [ 1515888000000, "987654", "jason@abc.com" ] ]
} ]
The UNION query performance is the same as querying two datasources simultaneously, so there's very minimal performance difference.
Comments
0 comments
Please sign in to leave a comment.