# # Contents # # 1. JSON Documents # 2. CRUD - Create / Read / Update / Delete # a. Create # - Different ways to insert/create an index # - Bulk indexing documents # b. Read # - Basic searches # - Intermediate searches # - Sample SQL query in Elasticsearch # - Facets and aggregations # - Aggregation use cases (doc values vs inverted index?) TODO # - Sample geo search # c. Update # - Updating documents TODO # d. Delete # - Deleting documents # 3. Mappings # 4. Analyzers # This is the Kibana Dev tools console, we'll use this to interact with Elasticsearch # # Elasticsearch stores documents using the JSON format # To quickly mention JSON, this is a sample JSON document { "name" : "Elastic", "location" : { "state" : "CA", "zipcode" : 94123 } } # A document can have many fields with values { "name" : "Elastic", ... : } # And each value must be one of 6 types to be valid JSON (string, number, object, array, boolean, null) # http://www.json.org/ # Let's index our first JSON document! # When we say index, we mean store in Elasticsearch # We'll use restaurant food safety violations from the City of San Francisco, let's index one POST /inspections/_doc { "business_address": "660 Sacramento St", "business_city": "San Francisco", "business_id": "2228", "business_latitude": "37.793698", "business_location": { "type": "Point", "coordinates": [ -122.403984, 37.793698 ] }, "business_longitude": "-122.403984", "business_name": "Tokyo Express", "business_postal_code": "94111", "business_state": "CA", "inspection_date": "2016-02-04T00:00:00.000", "inspection_id": "2228_20160204", "inspection_type": "Routine", "inspection_score":96, "risk_category": "Low Risk", "violation_description": "Unclean nonfood contact surfaces", "violation_id": "2228_20160204_103142" } # See the structure of the JSON document, there is a geopoint, dates, and numbers # Let's search the index using a GET command GET /inspections/_doc/_search # We'll dive deeper into the search API soon, for now, let's focus on indexing documents # A lot just happened, let's discuss # Elasticsearch uses a REST API, and it matters whether we use POST vs PUT # PUT requires an id for the document, as part of the URL # If we run the following we'll get an error PUT /inspections/_doc { "business_address": "660 Sacramento St", "business_city": "San Francisco", "business_id": "2228", "business_latitude": "37.793698", "business_location": { "type": "Point", "coordinates": [ -122.403984, 37.793698 ] }, "business_longitude": "-122.403984", "business_name": "Tokyo Express", "business_postal_code": "94111", "business_state": "CA", "inspection_date": "2016-02-04T00:00:00.000", "inspection_id": "2228_20160204", "inspection_type": "Routine", "inspection_score":96, "risk_category": "Low Risk", "violation_description": "Unclean nonfood contact surfaces", "violation_id": "2228_20160204_103142" } # POST creates the document's ID for us POST /inspections/_doc { "business_address": "660 Sacramento St", "business_city": "San Francisco", "business_id": "2228", "business_latitude": "37.793698", "business_location": { "type": "Point", "coordinates": [ -122.403984, 37.793698 ] }, "business_longitude": "-122.403984", "business_name": "Tokyo Express", "business_postal_code": "94111", "business_state": "CA", "inspection_date": "2016-02-04T00:00:00.000", "inspection_id": "2228_20160204", "inspection_type": "Routine", "inspection_score":96, "risk_category": "Low Risk", "violation_description": "Unclean nonfood contact surfaces", "violation_id": "2228_20160204_103142" } # We can also specify it with PUT PUT /inspections/_doc/12345 { "business_address": "660 Sacramento St", "business_city": "San Francisco", "business_id": "2228", "business_latitude": "37.793698", "business_location": { "type": "Point", "coordinates": [ -122.403984, 37.793698 ] }, "business_longitude": "-122.403984", "business_name": "Tokyo Express", "business_postal_code": "94111", "business_state": "CA", "inspection_date": "2016-02-04T00:00:00.000", "inspection_id": "2228_20160204", "inspection_type": "Routine", "inspection_score":96, "risk_category": "Low Risk", "violation_description": "Unclean nonfood contact surfaces", "violation_id": "2228_20160204_103142" } # Indexing the document automatically created the index for us, named "inspection" # The document is of type "report" (POST /inspection/report) # It is recommeneded to store only one type per index, as multiple types per index will not be supported in the future # Instead of dynamically creating the index based on the first document we add, we can create the index beforehand, to set certain settings DELETE /inspections PUT /inspections { "settings": { "index.number_of_shards": 1, "index.number_of_replicas": 0 } } # We'll use 1 shard for this example, and no replicas, we probably wouldn't want to do this in production # When you need to index a lot of docs, you should use the bulk API, you may see signficant performance benefits POST /inspections/_doc/_bulk { "index": { "_id": 1 }} {"business_address":"315 California St","business_city":"San Francisco","business_id":"24936","business_latitude":"37.793199","business_location":{"type":"Point","coordinates":[-122.400152,37.793199]},"business_longitude":"-122.400152","business_name":"San Francisco Soup Company","business_postal_code":"94104","business_state":"CA","inspection_date":"2016-06-09T00:00:00.000","inspection_id":"24936_20160609","inspection_score":77,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Improper food labeling or menu misrepresentation","violation_id":"24936_20160609_103141"} { "index": { "_id": 2 }} {"business_address":"10 Mason St","business_city":"San Francisco","business_id":"60354","business_latitude":"37.783527","business_location":{"type":"Point","coordinates":[-122.409061,37.783527]},"business_longitude":"-122.409061","business_name":"Soup Unlimited","business_postal_code":"94102","business_state":"CA","inspection_date":"2016-11-23T00:00:00.000","inspection_id":"60354_20161123","inspection_type":"Routine", "inspection_score": 95} { "index": { "_id": 3 }} {"business_address":"2872 24th St","business_city":"San Francisco","business_id":"1797","business_latitude":"37.752807","business_location":{"type":"Point","coordinates":[-122.409752,37.752807]},"business_longitude":"-122.409752","business_name":"TIO CHILOS GRILL","business_postal_code":"94110","business_state":"CA","inspection_date":"2016-07-05T00:00:00.000","inspection_id":"1797_20160705","inspection_score":90,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unclean nonfood contact surfaces","violation_id":"1797_20160705_103142"} { "index": { "_id": 4 }} {"business_address":"1661 Tennessee St Suite 3B","business_city":"San Francisco Whard Restaurant","business_id":"66198","business_latitude":"37.75072","business_location":{"type":"Point","coordinates":[-122.388478,37.75072]},"business_longitude":"-122.388478","business_name":"San Francisco Restaurant","business_postal_code":"94107","business_state":"CA","inspection_date":"2016-05-27T00:00:00.000","inspection_id":"66198_20160527","inspection_type":"Routine","inspection_score":56 } { "index": { "_id": 5 }} {"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"type":"Point","coordinates":[-122.481299,37.747228]},"business_longitude":"-122.481299","business_name":"Soup House","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"} { "index": { "_id": 6 }} {"business_address":"2162 24th Ave","business_city":"San Francisco","business_id":"5794","business_latitude":"37.747228","business_location":{"type":"Point","coordinates":[-122.481299,37.747228]},"business_longitude":"-122.481299","business_name":"Soup-or-Salad","business_phone_number":"+14155752700","business_postal_code":"94116","business_state":"CA","inspection_date":"2016-09-07T00:00:00.000","inspection_id":"5794_20160907","inspection_score":96,"inspection_type":"Routine - Unscheduled","risk_category":"Low Risk","violation_description":"Unapproved or unmaintained equipment or utensils","violation_id":"5794_20160907_103144"} # More info: https://www.elastic.co/guide/en/elasticsearch/guide/current/bulk.html #__________________________________________________ # Let's go back to executing our basic search # Find *all* documents GET /inspections/_doc/_search #__________________________________________________ # Let's find all inspection reports for places that sell soup GET /inspections/_doc/_search { "query": { "match": { "business_name": "soup" } } } # Let's look for restaurants with the name San Francisco # Since San Francisco is two words, we'll use match_phrase GET /inspections/_doc/_search { "query": { "match_phrase": { "business_name": "san francisco" } } } # Results are ranked by "relevance" (_score) # Let's look again GET /inspections/_doc/_search { "query": { "match": { "business_name": "soup" } } } # More info: https://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-intro.html #__________________________________________________ # We can also do boolean combinations of queries # Let's find all docs with "soup" and "san francisco" in the business name GET /inspections/_doc/_search { "query": { "bool": { "must": [ { "match": { "business_name": "soup" } }, { "match_phrase": { "business_name": "san francisco" } } ] } } } # # Or negate parts of a query, businesses without "soup" in the name (maybe you hate soup) GET /inspections/_doc/_search { "query": { "bool": { "must_not": [ { "match": { "business_name": "soup" } } ] } } } #__________________________________________________ # Combinations can be boosted for different effects # Let's emphasize places with "soup in the name" GET /inspections/_doc/_search { "query": { "bool": { "should": [ { "match_phrase": { "business_name": { "query": "soup", "boost" : 3 } } }, { "match_phrase": { "business_name": { "query": "san francisco" } } } ] } } } # Sometimes it's unclear what actually matched. # We can highlight the matching fragments: GET /inspections/_doc/_search { "query" : { "match": { "business_name": "soup" } }, "highlight": { "fields": { "business_name": {} } } } #__________________________________________________ # Finally, we can perform filtering, when we don't need text analysis (or need to do exact matches, range queries, etc.) # Let's find soup companies with a health score greater than 80 GET /inspections/_doc/_search { "query": { "range": { "inspection_score": { "gte": 80 } } }, "sort": [ { "inspection_score" : "desc" } ] } # More info: https://www.elastic.co/guide/en/elasticsearch/guide/current/structured-search.html # We can also sort our results by "inspection_score" # Sample SQL Query with Elasticsearch POST /_xpack/sql?format=txt { "query": "SELECT business_name, inspection_score FROM inspections ORDER BY inspection_score DESC LIMIT 5" } # Multiple methods to query Elasticsearch with SQL # - Through the rest endpoints (as seen above) # - Through the included CLI tool in the /bin directory of Elasticsearch # - JDBC Elasticsearch client # More details can be found here: https://www.elastic.co/guide/en/elasticsearch/reference/6.3/xpack-sql.html # Aggregations (one use case is faceting data) are very interesting # We won't have time to cover aggregation in depth now, but we want to get you familiar with # how they work, so you can use them on your own # Let's search for the term "soup", and bucket results by health score (similar to the facets you would see in an ebay site) # Show: https://www.ebay.com/sch/i.html?_from=R40&_trksid=p2380057.m570.l1313.TR12.TRC2.A0.H0.Xwatch.TRS0&_nkw=watch&_sacat=0 GET /inspections/_doc/_search { "query": { "match": { "business_name": "soup" } } ,"aggregations" : { "inspection_score" : { "range" : { "field" : "inspection_score", "ranges" : [ { "key" : "0-80", "from" : 0, "to" : 80 }, { "key" : "81-90", "from" : 81, "to" : 90 }, { "key" : "91-100", "from" : 91, "to" : 100 } ] } } } } # Geo search is another powerful tool for search # Let's find soup restaurants closest to us! # We have the geo point within the document, let's use it GET /inspections/_doc/_search # Let's execute the follow geo query, to sorted restaurants by distance by us GET /inspections/_doc/_search { "query": { "match": { "business_name": "soup"} }, "sort": [ { "_geo_distance": { "coordinates": { "lat": 37.783527, "lon": -122.409061 }, "order": "asc", "unit": "km" } } ] } # Error! Elasticsearch doesn't know the field is a geopoint # We must define this field as a geo point using mappings # Mapping are helpful for defining the structure of our document, and more efficiently storing/searching the data within our index # We have numbers/dates/strings, and geopoints, let's see what elasticsearch thinks our mapping is GET /inspections/_mapping/_doc # Let's change the mapping, delete our index, and perform our bulk import again # In production scenarios, you may prefer to use the reindex API, you can add new mapping fields without needing to migrate the data DELETE inspections PUT /inspections PUT inspections/_mapping/_doc { "properties": { "business_address": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_city": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_latitude": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "coordinates": { "type": "geo_point" }, "business_longitude": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_name": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_phone_number": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_postal_code": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "business_state": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "inspection_date": { "type": "date" }, "inspection_id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "inspection_score": { "type": "long" }, "inspection_type": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "risk_category": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "violation_description": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } }, "violation_id": { "type": "text", "fields": { "keyword": { "type": "keyword", "ignore_above": 256 } } } } } # Now we can execute our original geo query GET /inspections/_doc/_search { "query": { "match": { "business_name": "soup"} }, "sort": [ { "_geo_distance": { "business_location": { "lat": 37.800175, "lon": -122.409081 }, "order": "asc", "unit": "km", "distance_type": "plane" } } ] } # That was a very short introduction to geo queries and mappings, the goal was to get your feet wet to hopefuly go off and learn more # Let's finish the CRUD components, we covered C, and R, let's show show to update and delete documents # Let's add a flagged field to one of our documents, using a partial document update GET /inspections/_doc/_search POST /inspections/_doc/5/_update { "doc" : { "flagged" : true, "views": 0 } } # To delete a document, we can just pass the document id to the DELETE API DELETE /inspections/_doc/5 # That completed the CRUD section # - Analyzers # Text analysis is core to Elasticsearch, and very important to understand # As you saw a mapping configuration for data types in the previous example, you can also configure an analyzer per field or an entire index! # Analysis = tokenization + token filters # Tokenization breaks sentences into discrete tokens GET /inspections/_analyze { "tokenizer": "standard", "text": "my email address test123@company.com" } GET /inspections/_analyze { "tokenizer": "whitespace", "text": "my email address test123@company.com" } GET /inspections/_analyze { "tokenizer": "standard", "text": "Brown fox brown dog" } # And filters manipulate those tokens GET /inspections/_analyze { "tokenizer": "standard", "filter": ["lowercase"], "text": "Brown fox brown dog" } # There is a wide variety of filters. GET /inspections/_analyze { "tokenizer": "standard", "filter": ["lowercase", "unique"], "text": "Brown brown brown fox brown dog" } # More info: https://www.elastic.co/guide/en/elasticsearch/guide/current/_controlling_analysis.html