Difference between revisions of "Social Networking Data Discovery"

From Gcube Wiki
Jump to: navigation, search
(Design)
(Design)
Line 143: Line 143:
 
To this aim, we need to specify the document's schema of the objects we are going to push to ElasticSearch. There we underline which fields must be indexed (and how must be analyzed) and the ones won't be analyzed at all, thus won't be indexed/searchable. The schema is in JSON format and resembles the Java classes reported so far.
 
To this aim, we need to specify the document's schema of the objects we are going to push to ElasticSearch. There we underline which fields must be indexed (and how must be analyzed) and the ones won't be analyzed at all, thus won't be indexed/searchable. The schema is in JSON format and resembles the Java classes reported so far.
  
A simple "mapping" schema for our document could be
+
A simple "mapping" schema for our document is
  
 
<source lang="xml">
 
<source lang="xml">
// Document schema
+
//Documentschema
  "mappings": {
+
"mappings": {
    "enhanced_feeds": {
+
  "enhanced_feeds": {
      "_timestamp": {
+
    "_timestamp": {
        "enabled": true, // enable the create/update timestamp and make it retrievable
+
      "enabled": true,//enablethecreate/updatetimestampandmakeitretrievable
        "ignore_missing": false
+
      "ignore_missing": false
 +
    },
 +
    "properties": {
 +
      "attachments": {
 +
        "properties": {
 +
          "description": {
 +
            "type": "string",
 +
            "index": "no"
 +
          },
 +
          "key": {
 +
            "type": "string",
 +
            "index": "no"
 +
          },
 +
          "mimeType": {
 +
            "type": "string",
 +
            "index": "no"
 +
          },
 +
          "name": {
 +
            "type": "string",
 +
            "analyzer": "filename_index",
 +
            "search_analyzer": "filename_search"
 +
          },
 +
          "thumbnailURL": {
 +
            "type": "string",
 +
            "index": "no"
 +
          },
 +
          "uri": {
 +
            "type": "string",
 +
            "index": "no"
 +
          }
 +
        }
 
       },
 
       },
       "properties": {
+
       "comments": {
         "attachments": {
+
         "properties": {
           "properties": {
+
           "author": {
             "description": {
+
             "type": "string",
              "type": "string",
+
            "analyzer": "author_analyzer"
              "index": "no"
+
          },
            },
+
          "key": {
            "key": {
+
            "type": "string",
              "type": "string",
+
            "index": "no"
              "index": "no"
+
          },
            },
+
          "lastEditTime": {
            "mimeType": {
+
            "type": "long",
              "type": "string",
+
            "index": "no"
              "index": "no"
+
          },
            },
+
          "description": {
            "name": {
+
            "type": "string",
              "type": "string",
+
            "analyzer": "feed_comment_text_index_analyzer",
              "analyzer": "filename_index",
+
            "search_analyzer": "feed_comment_text_search_analyzer"
              "search_analyzer": "filename_search"
+
          },
            },
+
          "thumbnailURL": {
            "thumbnailURL": {
+
            "type": "string",
              "type": "string",
+
            "index": "no"
              "index": "no"
+
          },
            },
+
          "creationTime": {
            "uri": {
+
            "type": "long",
              "type": "string",
+
            "index": "no"
              "index": "no"
+
          },
            }
+
          "authorEmail": {
 +
            "type": "string",
 +
            "index": "no"
 
           }
 
           }
         },
+
         }
        "comments": {
+
      },
          "properties": {
+
      "post": {
            "author": {
+
        "properties": {
              "type": "string",
+
          "commentsNo": {
              "analyzer": "author_analyzer"
+
             "type": "long",
            },
+
             "index": "no"
            "key": {
+
           },
              "type": "string",
+
           "description": {
              "index": "no"
+
             "type": "string",
            },
+
             "analyzer": "feed_comment_text_index_analyzer",
            "lastEditTime": {
+
            "search_analyzer": "feed_comment_text_search_analyzer"
              "type": "long",
+
          },
              "index": "no"
+
          "authorEmail": {
            },
+
            "type": "string",
            "description": {
+
            "index": "no"
              "type": "string",
+
          },
              "analyzer": "feed_comment_text_index_analyzer",
+
          "author": {
              "search_analyzer": "feed_comment_text_search_analyzer"
+
            "type": "string",
             },
+
            "analyzer": "author_analyzer"
            "thumbnailURL": {
+
          },
              "type": "string",
+
          "key": {
              "index": "no"
+
            "type": "string",
            },
+
            "index": "no"
            "creationTime": {
+
          },
              "type": "long",
+
          "likesNo": {
              "index": "no"
+
            "type": "long",
             },
+
            "index": "no"
            "authorEmail": {
+
          },
              "type": "string",
+
          "privacy": {
              "index": "no"
+
            "type": "string",
            }
+
            "index": "no"
           }
+
          },
        },
+
          "thumbnailURL": {
        "post": {
+
            "type": "string",
           "properties": {
+
            "index": "no"
             "commentsNo": {
+
          },
              "type": "long",
+
          "time": {
              "index": "no"
+
            "type": "long",
             },
+
            "index": "no"
            "description": {
+
          },
              "type": "string",
+
          "context": {
              "analyzer": "feed_comment_text_index_analyzer",
+
            "type": "string",
              "search_analyzer": "feed_comment_text_search_analyzer"
+
            "index": "not_analyzed"
            },
+
            "authorEmail": {
+
              "type": "string",
+
              "index": "no"
+
            },
+
            "author": {
+
              "type": "string",
+
              "analyzer": "author_analyzer"
+
            },
+
            "key": {
+
              "type": "string",
+
              "index": "no"
+
            },
+
            "likesNo": {
+
              "type": "long",
+
              "index": "no"
+
            },
+
            "privacy": {
+
              "type": "string",
+
              "index": "no"
+
            },
+
            "thumbnailURL": {
+
              "type": "string",
+
              "index": "no"
+
            },
+
            "time": {
+
              "type": "long",
+
              "index": "no"
+
            },
+
            "context": {
+
              "type": "string",
+
              "index": "not_analyzed"
+
            }
+
 
           }
 
           }
 
         }
 
         }
 
       }
 
       }
 
     }
 
     }
 +
  }
 
</source>
 
</source>
  
 +
Please note, for example, that the field "context" within the post class will be not analyzed. This means that the search engine will not perform any preprocessing for this string object but all the needed structure to retrieve it will be created. Fields that won't be analyzed at all looks like
  
 +
<source lang="xml">
 +
"field_name":{
 +
  "type": "field_type",
 +
  "index": "no"
 +
}
 +
</source>
  
 +
As far as the remaining fields is concerned, we need to specify the analyzers that the engine will apply to preprocess them.
  
 
=== Architecture ===
 
=== Architecture ===

Revision as of 11:43, 1 August 2016

Overview

The purpose of this document is to show how the search facility over the D4Science Infrastructure Social Networking data, primarily stored in a Cassandra Cluster, has been realized. Cassandra is a highly scalable and distributable database, used by many companies around the world (eBay, Netflix, Instagram and many more). It offers high availability by means of data sharding and replication.

The engine that enables the full-text search is ElasticSearch. ElasticSearch is a highly scalable, distributable, open source full-text search and analytics engine based on the famous Apache Lucene software library. It runs on one or more cluster nodes and is reachable over http(s). It allows documents to be organized in one or more indexes according to their schema. This schema can be defined in JSON format, even though ElasticSearch tries to detect it automatically. ElasticSearch is a near-real-time search platform: there is a slight latency (normally one second) from the time you index a document until the time it becomes searchable.

The glue between Cassandra and ElasticSearch is a SmartExecutor plugin, namely the social-data-indexer plugin. In the following we are going to investigate which roles it has.

The main goal of the search facility is to let the users quickly search over this potentially huge amount of data, taking into account the data they are allowed to access. In fact, D4Science is a Research Infrastructure that offers many Virtual Research Environments (VREs). A user is allowed to see only the data of the VREs in which she is present. In order to do that, a client library has been created. It receives the query submitted by the user and returns the list of feeds belonging to the user's VREs, if any, sorted according to the scores they reached.

The Social Networking Library

The gCube Social Networking Library is the bridge between gCube Applications and the social networking facilities. The library discovers the Cassandra Cluster in the infrastructure and offers a lot of methods, such as post creation/deletion, comment creation/deletion, notifications generation and so on. All the information about the library can be retrieved here. As far as the search mechanism is concerned, the library is used to fetch data from the NoSql Cassandra cluster and to build up enhanced feeds. The concept of enhanced feed will be shown later.

Smart Executor

The SmartExecutor service allows clients to execute "gCube Tasks" and monitor their execution status. Each instance of the SmartExecutor service can run the "gCube Tasks" related to the plugins available on such an instance. Each instance of the SmartExecutor service publishes descriptive information about the co-deployed plugins. Clients may interact with the SmartExecutor service through a library (SmartExecutor Client) of high-level facilities to simplify the discovery of available plugins in those instances. Each client can request the execution of a "gCube Task" or get information about the state of its execution. More information is available here. For our purposes, it will take care of the lifecycle of the social-data-indexer plugin.

Key features

In order to understand the key features of the Social Networking Data Discovery facilities, we first need to understand what data Cassandra stores and how we can help users quickly retrieve information.

These data are mainly:

  • Users' posts;
  • Users' comments, and
  • Users' attachments metadata (the payload of such attachments is stored in a different database)

Of course, a lot of other information needs to be saved to offer a large set of social facilities, such as post notifications, comment notifications, messages exchange and so on. But these are not of our interest.

The full-text search mainly focuses on the data cited above. In principle, a single user's post is composed of the following elements:

  • Post's text, that is the initial content of the post [mandatory];
  • Post's author, that is the fullname of who made the post [mandatory];
  • zero or more comments to the post, so zero or more comments' texts and zero or more comments' authors;
  • zero or more attachments (pdfs, images, cvs and so on);
  • a VRE within the infrastructure in which the post/comments were published [mandatory].

Users can only access and see the data of the VREs to which they registered.

A post with the related text, comments, authors and attachments will be called an enhanced feed.

Up to now, a user can:

  • retrieve feeds by author (of both post/comments);
  • retrieve feeds by content (of both post/comments);
  • retrieve feeds by attachments' names.

In fact, these are the feed fields that are currently discoverable, even though there is no "advanced" search that allows specifying which of these fields must be queried. This aspect will be discussed further later.

In the following "Use Cases" paragraph, we are going to discuss each scenario.

Use cases

As stated above, the search facility allows a feed to be retrieved when the query matches at least one among the post's content, post's author, comments' contents, comments' authors or attachments' names fields. There is not yet an advanced search mechanism that allows specifying the field to match. This means that the approach used is to issue a MultiMatchQuery: the query is matched against each of these fields of the document to evaluate a partial score. These partial scores are then (mostly) summed up to get a final score for the document w.r.t. the query (so we are using a MultiMatch most_fields query). The most_fields type of multi-match query makes sense when we are querying different document fields analyzed in different ways, as in our case. How the document is analyzed will be shown later.

A list of possible use cases is the following:

  1. Search all feeds whose author is a specified user, e.g "Andrea Rossi": in this case the fullname "Andrea Rossi" is inserted into the search textbox;
  2. Search all feeds in which there are comments written by a specified user, e.g "Andrea Rossi": in this case the fullname "Andrea Rossi" is inserted into the search textbox;
  3. Suppose that a user remembers to have seen a feed with a file attached having name "report.pdf": in this case she can insert "report" or "report.pdf" into the search textbox;
  4. search for all the feeds with a .doc document attached: in this case ".doc" is inserted into the search textbox;
  5. search for a specific topic, e.g. "social data indexing": in this case she inserts "social", "social data", "social indexing" or the other possible combinations to look up the feeds.

Design

There are different actors needed to make the search feature work (see the Architecture paragraph). We have already discussed Cassandra, but there is something more to say about how ElasticSearch has been prepared and how the Social-Data-Indexer plugin acts.

Specifically, we need to tell the engine which fields of the document need to be indexed (so that it will create the inverted indexes and all the related structures to speed up the query and retrieval phases) and how we want to analyze them both at indexing time, i.e. when a document is inserted or updated, and at query time, i.e. when the query is analyzed for that specific field. This is an important aspect because it influences the accuracy of the results.

An enhanced feed (simplified) can be represented by the following java class

/**
 * Enhanced feed class.
 */
public class EnhancedFeed{
 
     private Post post;
     private List<Comment> comments;
     private List<Attachment> attachments;
 
....
}

Where a Post object looks like

/**
 * Post class.
 */
public class Post {
 
     private String key;
     private Date time;
     private String description;
     private String author;
     private String authorEmail;
     private long commentsNo;
     private long likesNo;
     private String context;
     private String privacy;
....
}

A comment looks like

/**
 * Comment class.
 */
public class Comment {
 
     private String key;
     private Date creationTime;
     private Date lastEditTime;
     private String description;
     private String author;
     private String authorEmail;
....
}

And, finally, an Attachment object looks like

/**
 * Attachment class.
 */
public class Attachment {
 
     private String key;
     private String uri;
     private String name;
     private String description;
     private String thumbnailURL;
     private String mimeType;
....
}

Now, we are mainly interested in making the following fields searchable:

  1. Post's description;
  2. Post's author;
  3. Comment's description;
  4. Comment's author;
  5. Feed's context and
  6. Attachment's name.

To this aim, we need to specify the document schema of the objects we are going to push to ElasticSearch. There we state which fields must be indexed (and how they must be analyzed) and which ones won't be analyzed at all, and thus won't be indexed/searchable. The schema is in JSON format and resembles the Java classes reported so far.

A simple "mapping" schema for our document is

// Document schema
"mappings": {
  "enhanced_feeds": {
    "_timestamp": {
      "enabled": true, // enable the create/update timestamp and make it retrievable
      "ignore_missing": false
    },
    "properties": {
      "attachments": {
        "properties": {
          "description": {
            "type": "string",
            "index": "no"
          },
          "key": {
            "type": "string",
            "index": "no"
          },
          "mimeType": {
            "type": "string",
            "index": "no"
          },
          "name": {
            "type": "string",
            "analyzer": "filename_index",
            "search_analyzer": "filename_search"
          },
          "thumbnailURL": {
            "type": "string",
            "index": "no"
          },
          "uri": {
            "type": "string",
            "index": "no"
          }
        }
      },
      "comments": {
        "properties": {
          "author": {
            "type": "string",
            "analyzer": "author_analyzer"
          },
          "key": {
            "type": "string",
            "index": "no"
          },
          "lastEditTime": {
            "type": "long",
            "index": "no"
          },
          "description": {
            "type": "string",
            "analyzer": "feed_comment_text_index_analyzer",
            "search_analyzer": "feed_comment_text_search_analyzer"
          },
          "thumbnailURL": {
            "type": "string",
            "index": "no"
          },
          "creationTime": {
            "type": "long",
            "index": "no"
          },
          "authorEmail": {
            "type": "string",
            "index": "no"
          }
        }
      },
      "post": {
        "properties": {
          "commentsNo": {
            "type": "long",
            "index": "no"
          },
          "description": {
            "type": "string",
            "analyzer": "feed_comment_text_index_analyzer",
            "search_analyzer": "feed_comment_text_search_analyzer"
          },
          "authorEmail": {
            "type": "string",
            "index": "no"
          },
          "author": {
            "type": "string",
            "analyzer": "author_analyzer"
          },
          "key": {
            "type": "string",
            "index": "no"
          },
          "likesNo": {
            "type": "long",
            "index": "no"
          },
          "privacy": {
            "type": "string",
            "index": "no"
          },
          "thumbnailURL": {
            "type": "string",
            "index": "no"
          },
          "time": {
            "type": "long",
            "index": "no"
          },
          "context": {
            "type": "string",
            "index": "not_analyzed"
          }
        }
      }
    }
  }

Please note, for example, that the field "context" within the post class will not be analyzed. This means that the search engine will not perform any preprocessing on this string, but all the structures needed to retrieve it will still be created. Fields that won't be indexed at all (and therefore won't be searchable) look like

"field_name":{
   "type": "field_type",
   "index": "no"
}

As far as the remaining fields are concerned, we need to specify the analyzers that the engine will apply to preprocess them.

Architecture

The following picture shows how the different actors interact and which roles they have.

Figure: Overall Architecture

Philosophy

TODO

API

TODO

Usage/Examples

TODO

Deployment

TODO