Ankit:
I wrote some Lucene sample code using your data and query.
I also provided gfsh commands to create the index on the nested fields.
Note: I purposely provided 2 documents to show the difference between the queries.
package examples;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.apache.geode.cache.Region;
import org.apache.geode.cache.client.ClientCache;
import org.apache.geode.cache.client.ClientCacheFactory;
import org.apache.geode.cache.client.ClientRegionShortcut;
import org.apache.geode.cache.lucene.LuceneIndex;
import org.apache.geode.cache.lucene.LuceneQuery;
import org.apache.geode.cache.lucene.LuceneQueryException;
import org.apache.geode.cache.lucene.LuceneService;
import org.apache.geode.cache.lucene.LuceneServiceProvider;
import org.apache.geode.cache.lucene.PageableLuceneQueryResults;
import org.apache.geode.pdx.JSONFormatter;
import org.apache.geode.pdx.PdxInstance;
public class JSONTest {

  // NOTE: Below is truncated JSON; a single JSON document can contain at most
  // an array of col1...col30 (30 different attributes) within "data".
  public final static String jsonDoc_2 = "{" +
      "\"data\":[{" +
      "\"col1\": {" +
      "\"k11\": \"aaa\"," +
      "\"k12\": true," +
      "\"k13\": 1111," +
      "\"k14\": \"2020-12-31:00:00:00\"" +
      "}," +
      "\"col2\":[{" +
      "\"k21\": \"222222\"," +
      "\"k22\": true" +
      "}]" +
      "}]" +
      "}";

  public final static String jsonDoc_3 = "{" +
      "\"data\":[{" +
      "\"col1\": {" +
      "\"k11\": \"bbb\"," +
      "\"k12\": true," +
      "\"k13\": 1111," +
      "\"k14\": \"2020-12-31:00:00:00\"" +
      "}," +
      "\"col2\":[{" +
      "\"k21\": \"333333\"," +
      "\"k22\": true" +
      "}]" +
      "}]" +
      "}";

  // NOTE: col1...col30 are a mix of JSONObject ({}) and JSONArray ([]) as
  // shown above in jsonDoc_2.

  public final static String REGION_NAME = "REGION_NAME";
  public static void main(String[] args)
      throws InterruptedException, LuceneQueryException {
    // create client-cache
    ClientCache cache = new ClientCacheFactory()
        .addPoolLocator("localhost", 10334)
        .setPdxReadSerialized(true)
        .create();
    Region<String, PdxInstance> region = cache
        .<String, PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
        .create(REGION_NAME);

    // store the json documents
    region.put("key", JSONFormatter.fromJSON(jsonDoc_2));
    region.put("key3", JSONFormatter.fromJSON(jsonDoc_3));

    // wait until the puts are indexed; the public LuceneService API is
    // sufficient here, no cast to the internal implementation classes needed
    // (getIndex can return null on a client, hence the null check)
    LuceneService service = LuceneServiceProvider.get(cache);
    LuceneIndex index = service.getIndex("jsonIndex", REGION_NAME);
    if (index != null) {
      service.waitUntilFlushed("jsonIndex", REGION_NAME, 60000, TimeUnit.MILLISECONDS);
    }

    LuceneQuery<String, PdxInstance> query = service.createLuceneQueryFactory()
        .create("jsonIndex", REGION_NAME, "222222 OR 333333", "data.col2.k21");
    System.out.println("Query 222222 OR 333333");
    HashSet<Object> results = getResults(query, REGION_NAME);

    LuceneQuery<String, PdxInstance> query2 = service.createLuceneQueryFactory()
        .create("jsonIndex", REGION_NAME, "aaa OR xxx OR yyy", "data.col1.k11");
    System.out.println("Query aaa OR xxx OR yyy");
    results = getResults(query2, REGION_NAME);
    // server side:
    // gfsh> start locator
    // gfsh> start server --name=server50505 --server-port=50505
    // gfsh> create lucene index --name=jsonIndex --region=/REGION_NAME \
    //         --field=data.col2.k21,data.col1.k11 \
    //         --serializer=org.apache.geode.cache.lucene.FlatFormatSerializer
    // gfsh> create region --name=REGION_NAME --type=PARTITION \
    //         --redundant-copies=1 --total-num-buckets=61
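    // You can also sanity-check the index and data directly from gfsh. A
    // hedged example (option names per the documented "search lucene"
    // command; adjust to your gfsh version):
    // gfsh> search lucene --name=jsonIndex --region=/REGION_NAME \
    //         --queryStrings="222222 OR 333333" --defaultField=data.col2.k21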
    // How to query the json document with OQL, like:
    // 1. select col2.k21, col1, col20 from /REGION_NAME
    //    where data.col2.k21 = '222222' OR data.col2.k21 = '333333'
    // 2. select col2.k21, col1.k11, col1 from /REGION_NAME
    //    where data.col1.k11 in ('aaa', 'xxx', 'yyy')
    // (a hedged OQL sketch using the QueryService follows after this class)
  }
  private static HashSet<Object> getResults(LuceneQuery<String, PdxInstance> query,
      String regionName) throws LuceneQueryException {
    if (query == null) {
      return null;
    }
    PageableLuceneQueryResults<String, PdxInstance> results = query.findPages();
    if (results.size() > 0) {
      System.out.println("Search found " + results.size() + " results in "
          + regionName + ", page size is " + query.getPageSize());
    }
    HashSet<Object> values = new HashSet<>();
    while (results.hasNext()) {
      results.next().stream().forEach(struct -> {
        Object value = struct.getValue();
        if (value instanceof PdxInstance) {
          PdxInstance pdx = (PdxInstance) value;
          String jsonString = JSONFormatter.toJSON(pdx);
          // the "data" field deserializes as a List of nested PdxInstances
          @SuppressWarnings("unchecked")
          List<PdxInstance> dataList = (List<PdxInstance>) pdx.getField("data");
          for (PdxInstance data : dataList) {
            Object colObject = data.getField("col1");
            System.out.println("col=" + colObject);
          }
          System.out.println("Found a json object:" + jsonString
              + ":dataList=" + dataList);
          values.add(pdx);
        } else {
          System.out.println("key=" + struct.getKey() + ",data=" + value);
          values.add(value);
        }
      });
    }
    System.out.println("Search found " + values.size() + " results in " + regionName);
    return values;
  }
}
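
For the two OQL-style queries in the comments, here is a minimal sketch using the
client QueryService. Treat it as a starting point, not a verified solution: the
class name OQLSketch is my own illustration, and the exact FROM/WHERE shapes over
the nested "data" and "col2" collections may need adjusting to your real documents.

import org.apache.geode.cache.client.ClientCache;
import org.apache.geode.cache.query.Query;
import org.apache.geode.cache.query.QueryService;
import org.apache.geode.cache.query.SelectResults;

public class OQLSketch {
  // Hedged sketch: OQL equivalents of the two queries commented in JSONTest.
  public static void runQueries(ClientCache cache) throws Exception {
    QueryService queryService = cache.getQueryService();

    // 1. Iterate the nested "data" and "col2" arrays, then filter on k21.
    Query q1 = queryService.newQuery(
        "select c.k21, d.col1 from /REGION_NAME r, r.data d, d.col2 c"
            + " where c.k21 = '222222' OR c.k21 = '333333'");
    SelectResults<?> r1 = (SelectResults<?>) q1.execute();
    System.out.println("query1 size=" + r1.size());

    // 2. IN SET(...) filter on the nested col1.k11 field.
    Query q2 = queryService.newQuery(
        "select d.col2, d.col1.k11, d.col1 from /REGION_NAME r, r.data d"
            + " where d.col1.k11 IN SET('aaa', 'xxx', 'yyy')");
    SelectResults<?> r2 = (SelectResults<?>) q2.execute();
    System.out.println("query2 size=" + r2.size());
  }
}

Because the client is created with setPdxReadSerialized(true), query results come
back as PdxInstance objects, so nested fields can be read by name the same way
getResults does above.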
Regards
Xiaojian
On 11/23/20, 9:46 AM, "Xiaojian Zhou" <[email protected]> wrote:
Ankit:
Anil can provide you some sample code for OQL queries on JSON.
I will find some lucene sample code on JSON for you.
Regards
Xiaojian
On 11/23/20, 9:27 AM, "ankit Soni" <[email protected]> wrote:
Hi
I am looking for any means of querying (OQL/Lucene/API, etc.) this stored
data, looking to achieve this functionality first and, second, in a
performant way.
I shared the OQL-like syntax to convey my use-case easily, based on some
reference found in the docs. I am OK if a Lucene query or some other way can
fetch the results.
It would be of great help if you share a sample query/code fetching this
data.
Thanks
Ankit.
On Mon, 23 Nov 2020 at 22:43, Xiaojian Zhou <[email protected]> wrote:
> Anil:
>
> The syntax is OQL. But I understand they want to query the JSON object
> based on the criteria.
>
> On 11/23/20, 9:08 AM, "Anilkumar Gingade" <[email protected]> wrote:
>
>     Gester, looking at the sample query, I believe Ankit is asking about
>     an OQL query, not Lucene...
>
> -Anil.
>
>
> On 11/23/20, 9:02 AM, "Xiaojian Zhou" <[email protected]> wrote:
>
> Ankit:
>
>         Geode provides Lucene queries on JSON fields. Your query can be
>         supported.
>
>
>         https://gemfire.docs.pivotal.io/910/geode/tools_modules/lucene_integration.html
>
>         However, the above document does not provide a query example on a
>         JSON object.
>
> I can give you some sample code to query on JSON.
>
> Regards
> Xiaojian Zhou
>
>         On 11/22/20, 11:53 AM, "ankit Soni" <[email protected]> wrote:
>
>             Hello geode-devs, please provide guidance on this.
>
> Ankit.
>
>             On Sat, 21 Nov 2020 at 10:23, ankit Soni <[email protected]> wrote:
>
> > Hello team,
> >
> > I am *evaluating usage of Geode (1.12) for storing JSON documents and
> > querying the same*. I am able to store the JSON records successfully in
> > Geode but am seeking guidance on how to query them.
> > More details on the code and sample JSON follow:
> >
> >
> > *Sample client-code*
> >
> > import org.apache.geode.cache.client.ClientCache;
> > import org.apache.geode.cache.client.ClientCacheFactory;
> > import org.apache.geode.cache.client.ClientRegionShortcut;
> > import org.apache.geode.pdx.JSONFormatter;
> > import org.apache.geode.pdx.PdxInstance;
> >
> > public class MyTest {
> >
> >     *//NOTE: Below is truncated json, a single json document can contain at most an array of col1...col30 (30 diff attributes) within data.*
> >     public final static String jsonDoc_2 = "{" +
> >             "\"data\":[{" +
> >                 "\"col1\": {" +
> >                     "\"k11\": \"aaa\"," +
> >                     "\"k12\":true," +
> >                     "\"k13\": 1111," +
> >                     "\"k14\": \"2020-12-31:00:00:00\"" +
> >                 "}," +
> >                 "\"col2\":[{" +
> >                     "\"k21\": \"222222\"," +
> >                     "\"k22\": true" +
> >                 "}]" +
> >             "}]" +
> >         "}";
> >
> >     *//NOTE: col1...col30 are a mix of JSONObject ({}) and JSONArray ([]) as shown above in jsonDoc_2;*
> >
> >     public static void main(String[] args){
> >
> >         //create client-cache
> >         ClientCache cache = new ClientCacheFactory().addPoolLocator(LOCATOR_HOST, PORT).create();
> >         Region<String, PdxInstance> region = cache.<String, PdxInstance>createClientRegionFactory(ClientRegionShortcut.CACHING_PROXY)
> >                 .create(REGION_NAME);
> >
> >         //store json document
> >         region.put("key", JSONFormatter.fromJSON(jsonDoc_2));
> >
> >         //How to query json document like,
> >
> >         // 1. select col2.k21, col1, col20 from /REGION_NAME where data.col2.k21 = '222222' OR data.col2.k21 = '333333'
> >
> >         // 2. select col2.k21, col1.k11, col1 from /REGION_NAME where data.col1.k11 in ('aaa', 'xxx', 'yyy')
> >     }
> > }
> >
> > *Server: Region-creation*
> >
> > gfsh> create region --name=REGION_NAME --type=PARTITION --redundant-copies=1 --total-num-buckets=61
> >
> >
> > *Setup: Distributed cluster of 3 nodes*
> >
> > *My Observations/Problems*
> > - Put operation takes excessive time: region.put("key",
> > JSONFormatter.fromJSON(jsonDoc_2)); - fetching a single record from a
> > file and storing it in Geode takes approx. 3 secs.
> > Are there any suggestions/configurations related to the JSONFormatter
> > API or otherwise to optimize this...?
> >
> > *Looking forward to guidance on querying this JSON for the above sample
> > queries.*
> >
> > *Thanks*
> > *Ankit*
> >