udf_flevtok.cc source:

  back to bertnase's stuff

/*
 * udf_flevtok.cc:					03'2005
 * (schoenfr@gaertner.de)
 *
 * implements a user defined function for the mysql server.
 *
 * returned is a maximum similarity number. 
 * it is calculated with help of the minimum levenshtein distance of a word 
 * to the tokens of a string.
 * the tokens are seperated by whitespaces.
 * the maximum similarity ( 0.0 < x <= 1.0) is returned.
 * 
 * this allows some fuzzy lookup of a word in a sentence. 
 *
 * 
 * g++ -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so \
 *	udf_flevtok.cc
 *
 * CREATE FUNCTION flevtok RETURNS REAL SONAME "udf_flevtok.so";
 * DROP FUNCTION flevtok;
 *
 * use:  int = levtok ( string words, string token, bool ignore_case)
 *
 *       to get the minimum levenshtein distance for any word in words
 *       checked against the token word.
 * 
 * select flevtok("nase", "Nase", 1)                    ->  1.0
 * select flevtok("nase", "Nase", 0)                    ->  0.75
 * select flevtok("der hase mit der nase", "nase", 0)   ->  1.0
 * select flevtok("nas en bear", "nase", 0)             ->  0.67
 * select flevtok("hasen ohren", "nase", 0)             ->  0.5
 * select flevtok("nah", "", 0)                         ->  0
 * select flevtok("a", NULL, 0)                         ->  NULL
 *
 * see also:  http://www.bertnase.de/html/stuff.html
 *
 * bugs:  - if the token-word contains whitespaces the result is funny.
 */

char *udf_flevtok_version = "udf_flevtok version: v1.0d, 15.03.2005";

/* Copyright (C) 2005 schoenfr@gaertner.de

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */


#ifdef STANDALONE
# include <stdio.h>
# include <stdlib.h>
# include <malloc.h>
# include <ctype.h>
# include <string.h>
#else
# ifdef STANDARD
#  include <stdio.h>
#  include <string.h>
#  ifdef __WIN__
typedef unsigned __int64 ulonglong;	/* Microsofts 64 bit types */
typedef __int64 longlong;
#  else
typedef unsigned long long ulonglong;
typedef long long longlong;
#  endif /*__WIN__*/
# else
#  include <my_global.h>
#  include <my_sys.h>
# endif
# include <mysql.h>
# include <m_ctype.h>
# include <m_string.h>		// To get strmov()
#endif /* ! STANDALONE */

static double lev_tok (char *line, char *tok);

/* this function is from: http://www.merriampark.com/ldc.htm */
static int levenshtein_distance(char *s,char*t);


#define min2(a,b)	((a) < (b) ? (a) : (b))
#define max2(a,b)	((a) > (b) ? (a) : (b))


static void
buf_tolower (char *s)
{
    int i;

    while (*s) {
      *s = tolower (*s);
      s++;
    }
}


double
lev_tok (char *buf, char *tok)
{
  int buf_len = strlen(buf);
  int tok_len = strlen(tok);
  double ret_val;
  char *ptr = buf;
  int min_len = min2(tok_len, buf_len);

  /* 
   * some specials:
   */
  if (buf_len == tok_len && ! strcmp (buf, tok)) {
    return 1.0;
  } else if (buf_len == 0 || tok_len == 0) {
    return 0.0;
  }

  /* no similarity: */
  ret_val = 0.0;

  while (*ptr) 
    {
      char *p, old_char;
      int rc;

      /*
       * skip over white:
       */
      while (*ptr && isspace(*ptr)) {
	ptr++;
      }
      //      if (! *ptr) {
      //	return ret_val;
      //  }

      /* 
       * scan end of token:
       */
      p = ptr;
      while (*p && ! isspace(*p)) {
	p++;
      }
      old_char = *p;
      *p = 0;

      rc = levenshtein_distance (ptr, tok);

      if (! rc) {
	/* exact match: */
	return 1.0;
      }
      if (rc != -1) {
	double f = 1.0 - (rc / (double) min2(strlen (ptr), tok_len));
	if (f > ret_val) {
	  ret_val = f;
	}
      }

      *p = old_char;
      ptr = p;
    }
  
  return ret_val;
}


#ifdef HAVE_DLOPEN

extern "C" {
  my_bool flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message);
  void flevtok_deinit (UDF_INIT *initid);
  double flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, 
		   char *error);
}


my_bool
flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message)
{
  char **buf_ptr;

  if (args->arg_count != 3
      || args->arg_type[0] != STRING_RESULT
      || args->arg_type[1] != STRING_RESULT
      || args->arg_type[2] != INT_RESULT)
    {
      strcpy (message,
          "FLEVTOK() requires three arguments (str text, str word, bool case)");
      return 1;
    }

  initid->max_length = 11;		// max result length (?)
  initid->maybe_null = 0;

  if (! (buf_ptr = (char **) malloc (2 *sizeof (char *)))
      || ! (buf_ptr [0] = (char *) malloc (args->lengths [0] + 1))
      || ! (buf_ptr [1] = (char *) malloc (args->lengths [1] + 1)))
    {
      strmov (message, "flevtok: Couldn't allocate memory");
      return 1;
    }
  initid->ptr = (char *) buf_ptr;

  return 0;
}

/****************************************************************************
** Deinit function. This should free all resources allocated by
** this function.
** Arguments:
** initid	Return value from xxxx_init
****************************************************************************/

void flevtok_deinit (UDF_INIT *initid)
{
  if (initid->ptr) {
    char **buf_ptr = (char **) initid->ptr;
    free (buf_ptr [1]);
    free (buf_ptr [0]);
    free (initid->ptr);
  }
}


/***************************************************************************
** UDF string function.
** Arguments:
** initid	Structure filled by xxx_init
** args		The same structure as to xxx_init. This structure
**		contains values for all parameters.
**		Note that the functions MUST check and convert all
**		to the type it wants!  Null values are represented by
**		a NULL pointer
** result	Possible buffer to save result. At least 255 byte long.
** length	Pointer to length of the above buffer.	In this the function
**		should save the result length
** is_null	If the result is null, one should store 1 here.
** error	If something goes fatally wrong one should store 1 here.
**
** This function should return a pointer to the result string.
** Normally this is 'result' but may also be an alloced string.
***************************************************************************/


double
flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error)
{
  char **buf_ptr, *arg1, *arg2;
  longlong arg3;
  double rc = 0.0;

  *error = 0;

  arg1 = args->args [0];
  arg2 = args->args [1];
  arg3 = * ((longlong *) args->args[2]);

  if (! arg1 || ! arg2)
    {
      *is_null = 1;
      return rc;
    }
  
  buf_ptr = (char **) initid->ptr;

  /*
   * make sure we have a trailing zero byte, so we ignore
   * binary data at treat the arg as a zero terminated string.
   */
  memcpy (buf_ptr [0], args->args [0], args->lengths [0]);
  buf_ptr [0] [args->lengths [0]] = 0;
  memcpy (buf_ptr [1], args->args [1], args->lengths [1]);
  buf_ptr [1] [args->lengths [1]] = 0;

  if (arg3 != 0) {
    buf_tolower (buf_ptr [0]);
    buf_tolower (buf_ptr [1]);
  }
  rc = lev_tok (buf_ptr [0], buf_ptr [1]);

  return rc;
}

#endif /* ! HAVE_DLOPEN */


#ifdef STANDALONE

int
main (int argc, char *argv[])
{
  double rc;
  char *tok;
  char line [1024];
  
  if (argc != 2) {
    return 0;
  }
  tok = argv [1];

  while (fgets (line, sizeof (line), stdin)) {
    if (line [strlen (line) - 1] == '\n') {
      line [strlen (line) - 1] = 0;
    }
    rc = lev_tok (line, tok);
    printf ("see: tok=.%s. line=.%s. -> %.3f\n", line, tok, rc);
  }

  return 0;
}
#endif

/* 
 * the remaining function is taken from:
 *   http://www.merriampark.com/ldc.htm
 *   Lorenzo Seidenari (sixmoney@virgilio.it)
 */
#define minimum(a,b,c)  min2(min2((a),(b)),(c))

/****************************************/
/*Implementation of Levenshtein distance*/
/****************************************/

static int
levenshtein_distance(char *s,char*t)
/*Compute levenshtein distance between s and t*/
{
  //Step 1
  int k,i,j,n,m,cost,*d,distance;
  n=strlen(s); 
  m=strlen(t);
  if(n!=0&&m!=0)
  {
    d=(int *)malloc((sizeof(int))*(m+1)*(n+1));
    m++;
    n++;
    //Step 2	
    for(k=0;k<n;k++)
	d[k]=k;
    for(k=0;k<m;k++)
      d[k*n]=k;
    //Step 3 and 4	
    for(i=1;i<n;i++)
      for(j=1;j<m;j++)
	{
        //Step 5
        if(s[i-1]==t[j-1])
          cost=0;
        else
          cost=1;
        //Step 6			 
        d[j*n+i]=minimum(d[(j-1)*n+i]+1,d[j*n+i-1]+1,d[(j-1)*n+i-1]+cost);
      }
    distance=d[n*m-1];
    free(d);
    return distance;
  }
  else 
    return -1; //a negative return value means that one or both strings are empty.
}

/* Local Variables: */
/* mode:c */
/* compile-command:"g++ -O -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so udf_flevtok.cc" */
/* End: */

/* end of udf_flevtok.cc */