Alibaba Cloud Elasticsearch: Building a “Search for Images by Image” Search Engine in Four Steps Using Vector Indexing
By Xiaosen
Released by ELK Geek
“Search by Image” is a common feature on shopping guide websites, and there are many ways to implement it, such as “hash fingerprint plus Hamming distance” and “feature vectors plus Milvus”. In practice, however, it is hard to achieve speed, precision, and simplicity at the same time.
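To make the first approach concrete, here is an illustrative sketch (not part of the solution built in this article) that compares two precomputed hash fingerprints by Hamming distance; the hash values are made up for the example.

# Illustrative sketch of the "hash fingerprint + Hamming distance" approach,
# assuming 64-bit (or shorter) perceptual hashes have already been computed.
def hamming_distance(hash_a: int, hash_b: int) -> int:
    # Count the bits that differ between the two fingerprints
    return bin(hash_a ^ hash_b).count("1")

# The smaller the distance, the more similar the two images are considered to be.
print(hamming_distance(0b10110010, 0b10010110))  # -> 2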
Comparison of Advantages and Disadvantages of “Search by Image” Methods
Query Performance of Solution 3
Building a “Search for Images by Image” Search Engine in Four Steps
This walkthrough is based on Alibaba Cloud Elasticsearch 6.7 with the vector search plug-in (aliyun-knn) installed. In this solution, each image is represented by a 512-dimensional feature vector.
The aliyun-knn plug-in is not available for self-managed Elasticsearch clusters. If you run your own cluster, we recommend using open-source Elasticsearch 7.x with the fast-elasticsearch-vector-scoring plug-in instead.
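Before designing the index, you can confirm that the vector plug-in is actually installed on the target cluster. A minimal sketch using the same Python client, endpoint, and placeholder credentials as the scripts later in this article:

# Sketch: list the plug-ins installed on each node and look for the vector plug-in
from elasticsearch5 import Elasticsearch

es = Elasticsearch("http://127.0.0.1:9200", http_auth=("elastic", "123455"))
print(es.cat.plugins(v=True))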
1) Design the Elasticsearch Index
1.1) Index Structure
# Create an image index
PUT images_v2
{
  "aliases": {
    "images": {}
  },
  "settings": {
    "index.codec": "proxima",
    "index.vector.algorithm": "hnsw",
    "index.number_of_replicas": 1,
    "index.number_of_shards": 3
  },
  "mappings": {
    "_doc": {
      "properties": {
        "feature": {
          "type": "proxima_vector",
          "dim": 512
        },
        "relation_id": {
          "type": "keyword"
        },
        "image_path": {
          "type": "keyword"
        }
      }
    }
  }
}
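The same index can also be created from Python. A minimal sketch, assuming the elasticsearch5 client and placeholder credentials used in the scripts later in this article; the body is the JSON shown above:

from elasticsearch5 import Elasticsearch

es = Elasticsearch("http://127.0.0.1:9200", http_auth=("elastic", "123455"))

index_body = {
    "aliases": {"images": {}},
    "settings": {
        "index.codec": "proxima",
        "index.vector.algorithm": "hnsw",
        "index.number_of_replicas": 1,
        "index.number_of_shards": 3
    },
    "mappings": {
        "_doc": {
            "properties": {
                "feature": {"type": "proxima_vector", "dim": 512},
                "relation_id": {"type": "keyword"},
                "image_path": {"type": "keyword"}
            }
        }
    }
}

# Create the index only if it does not already exist
if not es.indices.exists(index="images_v2"):
    es.indices.create(index="images_v2", body=index_body)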
1.2) DSL Statements
GET images/_search
{
  "query": {
    "hnsw": {
      "feature": {
        "vector": [255,....255],
        "size": 3,
        "ef": 1
      }
    }
  },
  "from": 0,
  "size": 20,
  "sort": [
    {
      "_score": {
        "order": "desc"
      }
    }
  ],
  "collapse": {
    "field": "relation_id"
  },
  "_source": {
    "includes": [
      "relation_id",
      "image_path"
    ]
  }
}
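The same query can be issued from the Python client. A minimal sketch with a placeholder vector (in practice, the 512-dimensional vector comes from the feature extraction step below); as is usual with HNSW, a larger ef generally improves recall at the cost of latency:

from elasticsearch5 import Elasticsearch

es = Elasticsearch("http://127.0.0.1:9200", http_auth=("elastic", "123455"))

query_vector = [0.0] * 512  # placeholder; use a real image feature vector here
body = {
    "query": {
        "hnsw": {
            "feature": {
                "vector": query_vector,
                "size": 3,
                "ef": 1
            }
        }
    },
    "from": 0,
    "size": 20,
    "sort": [{"_score": {"order": "desc"}}],
    "collapse": {"field": "relation_id"},
    "_source": {"includes": ["relation_id", "image_path"]}
}
print(es.search(index="images", body=body))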
2) Extract Image Features
extract_cnn_vgg16_keras.py
# -*- coding: utf-8 -*-
# Author: yongyuan.name
import numpy as np
from numpy import linalg as LA
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

class VGGNet:
    def __init__(self):
        # weights: 'imagenet'
        # pooling: 'max' or 'avg'
        # input_shape: (width, height, 3), width and height should be >= 48
        self.input_shape = (224, 224, 3)
        self.weight = 'imagenet'
        self.pooling = 'max'
        self.model = VGG16(weights=self.weight,
                           input_shape=(self.input_shape[0], self.input_shape[1], self.input_shape[2]),
                           pooling=self.pooling,
                           include_top=False)
        self.model.predict(np.zeros((1, 224, 224, 3)))

    '''
    Use the VGG16 model to extract features
    Output a normalized feature vector
    '''
    def extract_feat(self, img_path):
        img = image.load_img(img_path, target_size=(self.input_shape[0], self.input_shape[1]))
        img = image.img_to_array(img)
        img = np.expand_dims(img, axis=0)
        img = preprocess_input(img)
        feat = self.model.predict(img)
        norm_feat = feat[0] / LA.norm(feat[0])
        return norm_feat

# Obtain the image feature
from extract_cnn_vgg16_keras import VGGNet
model = VGGNet()
file_path = "./demo.jpg"
queryVec = model.extract_feat(file_path)
feature = queryVec.tolist()
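As a quick sanity check (a sketch continuing from the feature list produced above), the extracted vector should be 512-dimensional, matching "dim": 512 in the index mapping, and approximately L2-normalized:

import numpy as np

assert len(feature) == 512          # must match the "dim" in the index mapping
print(np.linalg.norm(feature))      # should be close to 1.0 (normalized in extract_feat)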
3) Write Image Features to Alibaba Cloud Elasticsearch
helper.py
import re
import urllib.request

def strip(path):
    """
    Sanitize a file or folder name
    Remove characters that are illegal in Windows file names
    :param path:
    :return:
    """
    path = re.sub(r'[?\\*|"<>:/]', '', str(path))
    return path

def getfilename(url):
    """
    Obtain the last segment of a URL as the file name
    :param url:
    :return:
    """
    filename = url.split('/')[-1]
    filename = strip(filename)
    return filename

def urllib_download(url, filename):
    """
    Download the file at the URL to the given path
    :param url:
    :param filename:
    :return:
    """
    return urllib.request.urlretrieve(url, filename)
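A small usage sketch of these helpers; the URL is a hypothetical example:

from helper import getfilename, urllib_download

url = "http://example.com/photos/cat.jpg"
filename = getfilename(url)              # last URL segment, with illegal characters stripped
urllib_download(url, "./images/" + filename)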
train.py
# coding=utf-8
import mysql.connector
import os
from helper import urllib_download, getfilename
from elasticsearch5 import Elasticsearch, helpers
from extract_cnn_vgg16_keras import VGGNet

model = VGGNet()
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)
mydb = mysql.connector.connect(
    host="127.0.0.1",    # Database host IP address
    user="root",         # Database username
    passwd="123456",     # Database password
    database="images"
)
mycursor = mydb.cursor()
image_path = "./images/"

def get_data(page=1):
    page_size = 20
    offset = (page - 1) * page_size
    sql = """
    SELECT id, relation_id, photo FROM images LIMIT {0},{1}
    """
    mycursor.execute(sql.format(offset, page_size))
    myresult = mycursor.fetchall()
    return myresult

def train_image_feature(myresult):
    indexName = "images"
    photo_path = "http://domain/{0}"  # Replace "domain" with the host that serves the images
    actions = []
    for x in myresult:
        id = str(x[0])
        relation_id = x[1]
        # photo = x[2].decode(encoding="utf-8")
        photo = x[2]
        full_photo = photo_path.format(photo)
        filename = image_path + getfilename(full_photo)
        if not os.path.exists(filename):
            try:
                urllib_download(full_photo, filename)
            except BaseException as e:
                print("Failed to download image {1} for id {0}".format(id, full_photo))
                continue
        if not os.path.exists(filename):
            continue
        try:
            feature = model.extract_feat(filename).tolist()
            action = {
                "_op_type": "index",
                "_index": indexName,
                "_type": "_doc",
                "_id": id,
                "_source": {
                    "relation_id": relation_id,
                    "feature": feature,
                    "image_path": photo
                }
            }
            actions.append(action)
        except BaseException as e:
            print("Failed to extract features from image {1} for id {0}".format(id, full_photo))
            continue
    # print(actions)
    succeed_num = 0
    for ok, response in helpers.streaming_bulk(es, actions):
        if not ok:
            print(ok)
            print(response)
        else:
            succeed_num += 1
    print("Updated {0} documents in this batch".format(succeed_num))
    es.indices.refresh(index=indexName)

page = 1
while True:
    print("Processing page {0}".format(page))
    myresult = get_data(page=page)
    if not myresult:
        print("No more data, exiting")
        break
    train_image_feature(myresult)
    page += 1
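After train.py finishes, you can verify how many image documents were indexed. A minimal sketch with the same client configuration as above:

from elasticsearch5 import Elasticsearch

es = Elasticsearch("http://127.0.0.1:9200", http_auth=("elastic", "123455"))
es.indices.refresh(index="images")
print(es.count(index="images"))   # returns a dict containing the document count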
4) Search for Images
import requests
import json
import os
import time
from elasticsearch5 import Elasticsearch
from extract_cnn_vgg16_keras import VGGNet

model = VGGNet()
http_auth = ("elastic", "123455")
es = Elasticsearch("http://127.0.0.1:9200", http_auth=http_auth)

# Save the uploaded image
# `request` here is the upload request object provided by the web framework
# hosting this handler (for example, Flask's request)
upload_image_path = "./runtime/"
upload_image = request.files.get("image")
upload_image_type = upload_image.content_type.split('/')[-1]
file_name = str(time.time())[:10] + '.' + upload_image_type
file_path = upload_image_path + file_name
upload_image.save(file_path)

# Calculate the feature vector of the image
queryVec = model.extract_feat(file_path)
feature = queryVec.tolist()

# Delete the temporary image
os.remove(file_path)

# Query Elasticsearch with the feature vector
body = {
    "query": {
        "hnsw": {
            "feature": {
                "vector": feature,
                "size": 5,
                "ef": 10
            }
        }
    },
    # "collapse": {
    #     "field": "relation_id"
    # },
    "_source": {"includes": ["relation_id", "image_path"]},
    "from": 0,
    "size": 40
}
indexName = "images"
res = es.search(index=indexName, body=body)
# Filter out low-scoring results as appropriate for your data; in our tests,
# results with a score of at least 0.65 met the requirements.
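Following the comment above, a small sketch that keeps only hits scoring at least 0.65 from the `res` returned by the search (0.65 is the empirical threshold noted above; tune it for your own data):

results = [
    {
        "relation_id": hit["_source"]["relation_id"],
        "image_path": hit["_source"]["image_path"],
        "score": hit["_score"]
    }
    for hit in res["hits"]["hits"]
    if hit["_score"] >= 0.65
]
print(results)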
Dependent Packages
mysql_connector_repackaged
elasticsearch
Pillow
tensorflow
requests
pandas
Keras
numpy
Summary
From a user-experience perspective, speed and accuracy are what make a product feel “easy to use” at the perceptible level. The “search for images by image” engine built in four steps with Alibaba Cloud Elasticsearch vector search (aliyun-knn) is not only fast and accurate but also simple to build and operate, which is another big advantage.