varunrai
4/2/2019 - 7:32 AM

extract_json_value.sql

-- test by do
-- select extract_json_value('{"a":["a","2"]}','(/a)')
-- common schema

delimiter //

drop function if exists json_to_xml//

create function json_to_xml(
    json_text text charset utf8
) returns text charset utf8
comment 'Transforms JSON to XML'
language SQL
deterministic
modifies sql data
sql security invoker
begin
    declare v_from, v_old_from int unsigned;
    declare v_token text;
    declare v_level int;
    declare v_state, expect_state varchar(255);
    declare _json_tokens_id int unsigned default 0;
    declare is_lvalue, is_rvalue tinyint unsigned;
    declare scope_stack text charset ascii;
    declare xml text charset utf8;
    declare xml_nodes, xml_node text charset utf8;
    
    set json_text := trim_wspace(json_text);
    
    set expect_state := 'object_begin';
    set is_lvalue := true;
    set is_rvalue := false;
    set scope_stack := '';
    set xml_nodes := '';
    set xml_node := '';
    set xml := '';
    get_token_loop: repeat 
        set v_old_from = v_from;
        call _get_json_token(json_text, v_from, v_level, v_token, 1, v_state);
        set _json_tokens_id := _json_tokens_id + 1;
        if v_state = 'whitespace' then
          iterate get_token_loop;
        end if;
        if v_level < 0 then
          return null;
          -- call throw('Negative nesting level found in _get_json_tokens');
        end if;
        if v_state = 'start' and scope_stack = '' then
          leave get_token_loop;
        end if;
        if FIND_IN_SET(v_state, expect_state) = 0 then
          return null;
          -- call throw(CONCAT('Expected ', expect_state, '. Got ', v_state));
        end if;
        if v_state = 'array_end' and left(scope_stack, 1) = 'o' then
          return null;
          -- call throw(CONCAT('Missing "}". Found ', v_state));
        end if;
        if v_state = 'object_end' and left(scope_stack, 1) = 'a' then
          return null;
          -- call throw(CONCAT('Missing "]". Found ', v_state));
        end if;
        if v_state = 'alpha' and lower(v_token) not in ('true', 'false', 'null') then
          return null;
          -- call throw(CONCAT('Unsupported literal: ', v_token));
        end if;
        set is_rvalue := false;
        case 
          when v_state = 'object_begin' then set expect_state := 'string', scope_stack := concat('o', scope_stack), is_lvalue := true;
          when v_state = 'array_begin' then set expect_state := 'string,object_begin', scope_stack := concat('a', scope_stack), is_lvalue := false;
          when v_state = 'string' and is_lvalue then set expect_state := 'colon', xml_node := v_token;
          when v_state = 'colon' then set expect_state := 'string,number,alpha,object_begin,array_begin', is_lvalue := false;
          when FIND_IN_SET(v_state, 'string,number,alpha') and not is_lvalue then set expect_state := 'comma,object_end,array_end', is_rvalue := true;
          when v_state = 'object_end' then set expect_state := 'comma,object_end,array_end', scope_stack := substring(scope_stack, 2);
          when v_state = 'array_end' then set expect_state := 'comma,object_end,array_end', scope_stack := substring(scope_stack, 2);
          when v_state = 'comma' and left(scope_stack, 1) = 'o' then set expect_state := 'string', is_lvalue := true;
          when v_state = 'comma' and left(scope_stack, 1) = 'a' then set expect_state := 'string,object_begin', is_lvalue := false;
        end case;
        set xml_node := unquote(xml_node);
        if v_state = 'object_begin' then 
          if substring_index(xml_nodes, ',', 1) != '' then
            set xml := concat(xml, '<', substring_index(xml_nodes, ',', 1), '>');
          end if;
          set xml_nodes := concat(',', xml_nodes);
        end if;
        if v_state = 'string' and is_lvalue then
          if left(xml_nodes, 1) = ',' then
            set xml_nodes := concat(xml_node, xml_nodes);
          else
            set xml_nodes := concat(xml_node, substring(xml_nodes, locate(',', xml_nodes)));
          end if;
        end if;
        if is_rvalue then
          set xml := concat(xml, '<', xml_node, '>', encode_xml(unquote(v_token)), '</', xml_node, '>');
        end if;
        if v_state = 'object_end' then 
          set xml_nodes := substring(xml_nodes, locate(',', xml_nodes) + 1);
          if substring_index(xml_nodes, ',', 1) != '' then
            set xml := concat(xml, '</', substring_index(xml_nodes, ',', 1), '>');
          end if;
        end if;
    until 
        v_old_from = v_from
    end repeat;
    return xml;
end;
//

delimiter ;



--
--
--

delimiter //

drop function if exists extract_json_value//

create function extract_json_value(
    json_text text charset utf8,
    xpath text charset utf8
) returns text charset utf8
comment 'Extracts JSON value via XPath'
language SQL
deterministic
modifies sql data
sql security invoker
begin
  return ExtractValue(json_to_xml(json_text), xpath); 
end;
//

delimiter ;


                   --
-- Trim white space characters on both sides of text.
-- As opposed to the standard TRIM() function, which only trims
-- strict space characters (' '), trim_wspace() also trims new line, 
-- tab and backspace characters
--
-- example:
--
-- SELECT trim_wspace('\n a b c \n  ')
-- Returns: 'a b c'
-- 

DELIMITER $$

DROP FUNCTION IF EXISTS trim_wspace $$
CREATE FUNCTION trim_wspace(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8 
DETERMINISTIC
NO SQL
SQL SECURITY INVOKER
COMMENT 'Trim whitespace characters on both sides'

begin
  declare len INT UNSIGNED DEFAULT 0;
  declare done TINYINT UNSIGNED DEFAULT 0;
  if txt IS NULL then
    return txt;
  end if;
  while not done do
    set len := CHAR_LENGTH(txt);
    set txt = trim(' ' FROM txt);
    set txt = trim('\r' FROM txt);
    set txt = trim('\n' FROM txt);
    set txt = trim('\t' FROM txt);
    set txt = trim('\b' FROM txt);
    if CHAR_LENGTH(txt) = len then
      set done := 1;
    end if;
  end while;
  return txt;
end $$

DELIMITER ;


delimiter //

set names utf8
//

drop procedure if exists _get_json_token;
//

create procedure _get_json_token(
    in      p_text      text charset utf8
,   inout   p_from      int unsigned
,   inout   p_level     int
,   out     p_token     text charset utf8
,   in      allow_script_tokens int
,   inout   p_state     enum(
                            'alpha'
                        ,   'alphanum'
                        ,   'colon'
                        ,   'comma'                        
                        ,   'decimal'
                        ,   'error'
                        ,   'integer'
                        ,   'number'
                        ,   'minus'
                        ,   'object_begin'
                        ,   'object_end'
                        ,   'array_begin'
                        ,   'array_end'
                        ,   'start'
                        ,   'string'
                        ,   'whitespace'
                        )               
)
comment 'Reads a token according to lexical rules for JSON'
language SQL
deterministic
no sql
sql security invoker
begin    
    declare v_length int unsigned default character_length(p_text);
    declare v_char, v_lookahead, v_quote_char    varchar(1) charset utf8;
    declare v_from int unsigned;
    declare negative_number bool default false;

    if p_from is null then
        set p_from = 1;
    end if;
    if p_level is null then
        set p_level = 0;
    end if;
    if p_state = 'object_end' then
        set p_level = p_level - 1;
    end if;
    if p_state = 'array_end' and allow_script_tokens then
        set p_level = p_level - 1;
    end if;
    set v_from = p_from;
    
    set p_token = ''
    ,   p_state = 'start';
    my_loop: while v_from <= v_length do
        set v_char = substr(p_text, v_from, 1)
        ,   v_lookahead = substr(p_text, v_from+1, 1)
        ;
        if v_char = '-' then
            set negative_number := true, v_from = v_from + 1;
            iterate my_loop;
        end if;
        state_case: begin case p_state
            when 'error' then 
                set p_from = v_length;
                leave state_case;            
            when 'start' then
                case
                    when v_char = '-' then
                        set p_state = 'minus', v_from = v_from + 1;
                    when v_char between '0' and '9' then 
                        set p_state = 'integer';
                    when v_char between 'A' and 'Z' 
                    or   v_char between 'a' and 'z' 
                    or   v_char = '_' then
                        set p_state = 'alpha';                        
                    when v_char = ' ' then 
                        set p_state = 'whitespace'
                        ,   v_from = v_length - character_length(ltrim(substring(p_text, v_from)))
                        ;
                        leave state_case;
                    when v_char in ('\t', '\n', '\r') then 
                        set p_state = 'whitespace';
                    when v_char = '"' then
                        set p_state = 'string', v_quote_char = v_char;
                    when v_char = '.' then
                        if substr(p_text, v_from + 1, 1) between '0' and '9' then
                            set p_state = 'decimal', v_from = v_from + 1;
                        else
                            set p_state = 'error';
                            leave my_loop;
                        end if;
                    when v_char = ',' then
                        set p_state = 'comma', v_from = v_from + 1;
                        leave my_loop;
                    when v_char = ':' then 
                        set p_state = 'colon', v_from = v_from + 1;
                        leave my_loop;
                    when v_char = '{' then 
                        set p_state = 'object_begin', v_from = v_from + 1, p_level = p_level + 1;
                        leave my_loop;
                    when v_char = '}' then
                        set p_state = 'object_end', v_from = v_from + 1;
                        leave my_loop;
                    when v_char = '[' then 
                        set p_state = 'array_begin', v_from = v_from + 1, p_level = p_level + 1;
                        leave my_loop;
                    when v_char = ']' then 
                        set p_state = 'array_end', v_from = v_from + 1;
                        leave my_loop;
                    else 
                        set p_state = 'error';
                end case;
            when 'alpha' then 
                case
                    when v_char between 'A' and 'Z' 
                    or   v_char between 'a' and 'z' 
                    or   v_char = '_' then
                        leave state_case;
                    when v_char between '0' and '9' then 
                        set p_state = 'alphanum';
                    else
                        leave my_loop;
                end case;
            when 'alphanum' then 
                case
                    when v_char between 'A' and 'Z' 
                    or   v_char between 'a' and 'z' 
                    or   v_char = '_'
                    or   v_char between '0' and '9' then 
                        leave state_case;
                    else
                        leave my_loop;
                end case;
            when 'integer' then
                case 
                    when v_char between '0' and '9' then 
                        leave state_case;
                    when v_char = '.' then 
                        set p_state = 'decimal';
                    else
                        leave my_loop;                        
                end case;
            when 'decimal' then
                case 
                    when v_char between '0' and '9' then 
                        leave state_case;
                    else
                        leave my_loop;
                end case;
            when 'whitespace' then
                if v_char not in ('\t', '\n', '\r') then
                    leave my_loop;                        
                end if;
            when 'string' then
                set v_from = locate(v_quote_char, p_text, v_from);
                if v_from then
                    if substr(p_text, v_from + 1, 1) = v_quote_char then
                        set v_from = v_from + 1;
                    elseif substr(p_text, v_from - 1, 1) != '\\' then
                        set v_from = v_from + 1;
                        leave my_loop;
                    end if;
                else
                    set p_state = 'error';
                    leave my_loop;
                end if;
            else
                leave my_loop;            
        end case; end state_case;
        set v_from = v_from + 1;
    end while my_loop;
    set p_token = substr(p_text, p_from, v_from - p_from) collate utf8_general_ci;
    set p_from = v_from;
    if p_state in ('decimal', 'integer') then
      set p_state := 'number';
    end if;
    if p_state = 'alphanum' then
      set p_state := 'alpha';
    end if;
    if negative_number and (p_state != 'number') then
      set p_token := NULL;
    end if;
end;
//

delimiter ;


              --
-- Unquotes a given text.
-- Removes leading and trailing quoting characters (one of: "'/)
-- Unquoting works only if both leading and trailing character are identical.
-- There is no nesting or sub-unquoting.
--
-- example:
--
-- SELECT unquote('\"saying\"') 
-- Returns: 'saying'
--

DELIMITER $$

DROP FUNCTION IF EXISTS unquote $$
CREATE FUNCTION unquote(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8 
DETERMINISTIC
NO SQL
SQL SECURITY INVOKER
COMMENT 'Unquotes a given text'

begin
  declare quoting_char VARCHAR(1) CHARSET utf8;
  declare terminating_quote_escape_char VARCHAR(1) CHARSET utf8;
  declare current_pos INT UNSIGNED;
  declare end_quote_pos INT UNSIGNED;

  if CHAR_LENGTH(txt) < 2 then
    return txt;
  end if;
  
  set quoting_char := LEFT(txt, 1);
  if not quoting_char in ('''', '"', '`', '/') then
    return txt;
  end if;
  if txt in ('''''', '""', '``', '//') then
    return '';
  end if;
  
  set current_pos := 1;
  terminating_quote_loop: while current_pos > 0 do
    set current_pos := LOCATE(quoting_char, txt, current_pos + 1);
    if current_pos = 0 then
      -- No terminating quote
      return txt;
    end if;
    if SUBSTRING(txt, current_pos, 2) = REPEAT(quoting_char, 2) then
      set current_pos := current_pos + 1;
      iterate terminating_quote_loop;
    end if;
    set terminating_quote_escape_char := SUBSTRING(txt, current_pos - 1, 1);
    if (terminating_quote_escape_char = quoting_char) or (terminating_quote_escape_char = '\\') then
      -- This isn't really a quote end: the quote is escaped. 
      -- We do nothing; just a trivial assignment.
      iterate terminating_quote_loop;
    end if;
    -- Found terminating quote.
    leave terminating_quote_loop;
  end while;
  if current_pos = CHAR_LENGTH(txt) then
      return SUBSTRING(txt, 2, CHAR_LENGTH(txt) - 2);
    end if;
  return txt;
end $$

DELIMITER ;



--
-- Encode a given text for XML.
-- 

DELIMITER $$

DROP FUNCTION IF EXISTS encode_xml $$
CREATE FUNCTION encode_xml(txt TEXT CHARSET utf8) RETURNS TEXT CHARSET utf8 
DETERMINISTIC
NO SQL
SQL SECURITY INVOKER
COMMENT 'Encode (escape) given text for XML'

begin
  set txt := REPLACE(txt, '&', '&amp;');
  set txt := REPLACE(txt, '<', '&lt;');
  set txt := REPLACE(txt, '>', '&gt;');
  set txt := REPLACE(txt, '"', '&quot;');
  set txt := REPLACE(txt, '''', '&apos;');
  
  return txt;
end $$

DELIMITER ;