udf_flevtok.cc source: |
back to bertnase's stuff | |
/* * udf_flevtok.cc: 03'2005 * (schoenfr@gaertner.de) * * implements a user defined function for the mysql server. * * returned is a maximum similarity number. * it is calculated with help of the minimum levenshtein distance of a word * to the tokens of a string. * the tokens are seperated by whitespaces. * the maximum similarity ( 0.0 < x <= 1.0) is returned. * * this allows some fuzzy lookup of a word in a sentence. * * * g++ -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so \ * udf_flevtok.cc * * CREATE FUNCTION flevtok RETURNS REAL SONAME "udf_flevtok.so"; * DROP FUNCTION flevtok; * * use: int = levtok ( string words, string token, bool ignore_case) * * to get the minimum levenshtein distance for any word in words * checked against the token word. * * select flevtok("nase", "Nase", 1) -> 1.0 * select flevtok("nase", "Nase", 0) -> 0.75 * select flevtok("der hase mit der nase", "nase", 0) -> 1.0 * select flevtok("nas en bear", "nase", 0) -> 0.67 * select flevtok("hasen ohren", "nase", 0) -> 0.5 * select flevtok("nah", "", 0) -> 0 * select flevtok("a", NULL, 0) -> NULL * * see also: http://www.bertnase.de/html/stuff.html * * bugs: - if the token-word contains whitespaces the result is funny. */ char *udf_flevtok_version = "udf_flevtok version: v1.0d, 15.03.2005"; /* Copyright (C) 2005 schoenfr@gaertner.de This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifdef STANDALONE # include <stdio.h> # include <stdlib.h> # include <malloc.h> # include <ctype.h> # include <string.h> #else # ifdef STANDARD # include <stdio.h> # include <string.h> # ifdef __WIN__ typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */ typedef __int64 longlong; # else typedef unsigned long long ulonglong; typedef long long longlong; # endif /*__WIN__*/ # else # include <my_global.h> # include <my_sys.h> # endif # include <mysql.h> # include <m_ctype.h> # include <m_string.h> // To get strmov() #endif /* ! STANDALONE */ static double lev_tok (char *line, char *tok); /* this function is from: http://www.merriampark.com/ldc.htm */ static int levenshtein_distance(char *s,char*t); #define min2(a,b) ((a) < (b) ? (a) : (b)) #define max2(a,b) ((a) > (b) ? (a) : (b)) static void buf_tolower (char *s) { int i; while (*s) { *s = tolower (*s); s++; } } double lev_tok (char *buf, char *tok) { int buf_len = strlen(buf); int tok_len = strlen(tok); double ret_val; char *ptr = buf; int min_len = min2(tok_len, buf_len); /* * some specials: */ if (buf_len == tok_len && ! strcmp (buf, tok)) { return 1.0; } else if (buf_len == 0 || tok_len == 0) { return 0.0; } /* no similarity: */ ret_val = 0.0; while (*ptr) { char *p, old_char; int rc; /* * skip over white: */ while (*ptr && isspace(*ptr)) { ptr++; } // if (! *ptr) { // return ret_val; // } /* * scan end of token: */ p = ptr; while (*p && ! isspace(*p)) { p++; } old_char = *p; *p = 0; rc = levenshtein_distance (ptr, tok); if (! rc) { /* exact match: */ return 1.0; } if (rc != -1) { double f = 1.0 - (rc / (double) min2(strlen (ptr), tok_len)); if (f > ret_val) { ret_val = f; } } *p = old_char; ptr = p; } return ret_val; } #ifdef HAVE_DLOPEN extern "C" { my_bool flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message); void flevtok_deinit (UDF_INIT *initid); double flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error); } my_bool flevtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message) { char **buf_ptr; if (args->arg_count != 3 || args->arg_type[0] != STRING_RESULT || args->arg_type[1] != STRING_RESULT || args->arg_type[2] != INT_RESULT) { strcpy (message, "FLEVTOK() requires three arguments (str text, str word, bool case)"); return 1; } initid->max_length = 11; // max result length (?) initid->maybe_null = 0; if (! (buf_ptr = (char **) malloc (2 *sizeof (char *))) || ! (buf_ptr [0] = (char *) malloc (args->lengths [0] + 1)) || ! (buf_ptr [1] = (char *) malloc (args->lengths [1] + 1))) { strmov (message, "flevtok: Couldn't allocate memory"); return 1; } initid->ptr = (char *) buf_ptr; return 0; } /**************************************************************************** ** Deinit function. This should free all resources allocated by ** this function. ** Arguments: ** initid Return value from xxxx_init ****************************************************************************/ void flevtok_deinit (UDF_INIT *initid) { if (initid->ptr) { char **buf_ptr = (char **) initid->ptr; free (buf_ptr [1]); free (buf_ptr [0]); free (initid->ptr); } } /*************************************************************************** ** UDF string function. ** Arguments: ** initid Structure filled by xxx_init ** args The same structure as to xxx_init. This structure ** contains values for all parameters. ** Note that the functions MUST check and convert all ** to the type it wants! Null values are represented by ** a NULL pointer ** result Possible buffer to save result. At least 255 byte long. ** length Pointer to length of the above buffer. In this the function ** should save the result length ** is_null If the result is null, one should store 1 here. ** error If something goes fatally wrong one should store 1 here. ** ** This function should return a pointer to the result string. ** Normally this is 'result' but may also be an alloced string. ***************************************************************************/ double flevtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) { char **buf_ptr, *arg1, *arg2; longlong arg3; double rc = 0.0; *error = 0; arg1 = args->args [0]; arg2 = args->args [1]; arg3 = * ((longlong *) args->args[2]); if (! arg1 || ! arg2) { *is_null = 1; return rc; } buf_ptr = (char **) initid->ptr; /* * make sure we have a trailing zero byte, so we ignore * binary data at treat the arg as a zero terminated string. */ memcpy (buf_ptr [0], args->args [0], args->lengths [0]); buf_ptr [0] [args->lengths [0]] = 0; memcpy (buf_ptr [1], args->args [1], args->lengths [1]); buf_ptr [1] [args->lengths [1]] = 0; if (arg3 != 0) { buf_tolower (buf_ptr [0]); buf_tolower (buf_ptr [1]); } rc = lev_tok (buf_ptr [0], buf_ptr [1]); return rc; } #endif /* ! HAVE_DLOPEN */ #ifdef STANDALONE int main (int argc, char *argv[]) { double rc; char *tok; char line [1024]; if (argc != 2) { return 0; } tok = argv [1]; while (fgets (line, sizeof (line), stdin)) { if (line [strlen (line) - 1] == '\n') { line [strlen (line) - 1] = 0; } rc = lev_tok (line, tok); printf ("see: tok=.%s. line=.%s. -> %.3f\n", line, tok, rc); } return 0; } #endif /* * the remaining function is taken from: * http://www.merriampark.com/ldc.htm * Lorenzo Seidenari (sixmoney@virgilio.it) */ #define minimum(a,b,c) min2(min2((a),(b)),(c)) /****************************************/ /*Implementation of Levenshtein distance*/ /****************************************/ static int levenshtein_distance(char *s,char*t) /*Compute levenshtein distance between s and t*/ { //Step 1 int k,i,j,n,m,cost,*d,distance; n=strlen(s); m=strlen(t); if(n!=0&&m!=0) { d=(int *)malloc((sizeof(int))*(m+1)*(n+1)); m++; n++; //Step 2 for(k=0;k<n;k++) d[k]=k; for(k=0;k<m;k++) d[k*n]=k; //Step 3 and 4 for(i=1;i<n;i++) for(j=1;j<m;j++) { //Step 5 if(s[i-1]==t[j-1]) cost=0; else cost=1; //Step 6 d[j*n+i]=minimum(d[(j-1)*n+i]+1,d[j*n+i-1]+1,d[(j-1)*n+i-1]+cost); } distance=d[n*m-1]; free(d); return distance; } else return -1; //a negative return value means that one or both strings are empty. } /* Local Variables: */ /* mode:c */ /* compile-command:"g++ -O -shared -I/usr/local/mysql/include/mysql -o udf_flevtok.so udf_flevtok.cc" */ /* End: */ /* end of udf_flevtok.cc */