udf_levtok.cc source: |
back to bertnase's stuff | |
/* * udf_levtok.cc: 03'2005 * (schoenfr@web.de) * * implements a user defined function for the mysql server. * * returned is the minimun levenshtein distance of a word * to the token of a string. the tokens are seperated by whitespaces. * the minimun distance is returned. * * this allows some fuzzy lookup of a word in a sentence. * * * g++ -shared -I/usr/local/mysql/include/mysql -o udf_levtok.so udf_levtok.cc * * CREATE FUNCTION levtok RETURNS INTEGER SONAME "udf_levtok.so"; * DROP FUNCTION levtok; * * use: int = levtok ( string words, string token, bool ignore_case) * * to get the minimum levenshtein distance for any word in words * checked against the token word. * * select levtok("nase", "Nase", 1) -> 0 * select levtok("nase", "Nase", 0) -> 1 * select levtok("der hase mit der nase", "nase", 0) -> 0 * select levtok("nas en bear", "nase", 0) -> 1 * select levtok("hasen ohren", "nase", 0) -> 2 * select levtok("nah", "", 0) -> 3 * select levtok("a", NULL, 0) -> NULL * * see also: http://www.bertnase.de/html/stuff.html * * bugs: - if the token-word contains whitespaces the result is funny. */ char *udf_levtok_version = "udf_levtok version: v2.0e, 13.03.2005"; /* Copyright (C) 2005 schoenfr@web.de This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #if defined(STANDALONE) # include <stdio.h> # include <stdlib.h> # include <malloc.h> # include <ctype.h> # include <string.h> #else # ifdef STANDARD # include <stdio.h> # include <string.h> # ifdef __WIN__ typedef unsigned __int64 ulonglong; /* Microsofts 64 bit types */ typedef __int64 longlong; # else typedef unsigned long long ulonglong; typedef long long longlong; # endif /*__WIN__*/ # else # include <my_global.h> # include <my_sys.h> # endif # include <mysql.h> # include <m_ctype.h> # include <m_string.h> // To get strmov() static int lev_tok (char *line, char *tok); static void buf_tolower (char *s) { int i; while (*s) { *s = tolower (*s); s++; } } #ifdef HAVE_DLOPEN extern "C" { my_bool levtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message); void levtok_deinit (UDF_INIT *initid); longlong levtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error); } my_bool levtok_init (UDF_INIT *initid, UDF_ARGS *args, char *message) { char **buf_ptr; if (args->arg_count != 3 || args->arg_type[0] != STRING_RESULT || args->arg_type[1] != STRING_RESULT || args->arg_type[2] != INT_RESULT) { strcpy (message, "LEVTOK() requires three arguments (str text, str word, bool case)"); return 1; } initid->max_length = 11; // max result length (?) initid->maybe_null = 0; if (! (buf_ptr = (char **) malloc (2 *sizeof (char *))) || ! (buf_ptr [0] = (char *) malloc (args->lengths [0] + 1)) || ! (buf_ptr [1] = (char *) malloc (args->lengths [1] + 1))) { strmov (message, "levtok: Couldn't allocate memory"); return 1; } initid->ptr = (char *) buf_ptr; return 0; } /**************************************************************************** ** Deinit function. This should free all resources allocated by ** this function. ** Arguments: ** initid Return value from xxxx_init ****************************************************************************/ void levtok_deinit (UDF_INIT *initid) { if (initid->ptr) { char **buf_ptr = (char **) initid->ptr; free (buf_ptr [1]); free (buf_ptr [0]); free (initid->ptr); } } /*************************************************************************** ** UDF string function. ** Arguments: ** initid Structure filled by xxx_init ** args The same structure as to xxx_init. This structure ** contains values for all parameters. ** Note that the functions MUST check and convert all ** to the type it wants! Null values are represented by ** a NULL pointer ** result Possible buffer to save result. At least 255 byte long. ** length Pointer to length of the above buffer. In this the function ** should save the result length ** is_null If the result is null, one should store 1 here. ** error If something goes fatally wrong one should store 1 here. ** ** This function should return a pointer to the result string. ** Normally this is 'result' but may also be an alloced string. ***************************************************************************/ longlong levtok (UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) { char **buf_ptr, *arg1, *arg2; longlong rc = 0, arg3; *error = 0; arg1 = args->args [0]; arg2 = args->args [1]; arg3 = * ((longlong *) args->args[2]); if (! arg1 || ! arg2) { *is_null = 1; return rc; } buf_ptr = (char **) initid->ptr; /* * make sure we have a trailing zero byte, so we ignore * binary data at treat the arg as a zero terminated string. */ memcpy (buf_ptr [0], args->args [0], args->lengths [0]); buf_ptr [0] [args->lengths [0]] = 0; memcpy (buf_ptr [1], args->args [1], args->lengths [1]); buf_ptr [1] [args->lengths [1]] = 0; if (arg3 != 0) { buf_tolower (buf_ptr [0]); buf_tolower (buf_ptr [1]); } rc = lev_tok (buf_ptr [0], buf_ptr [1]); return rc; } #endif /* ! HAVE_DLOPEN */ #endif /* ! STANDALONE */ /* this function is from: http://www.merriampark.com/ldc.htm */ static int levenshtein_distance(char *s,char*t); #define max2(a,b) ((a) > (b) ? (a) : (b)) int lev_tok (char *buf, char *tok) { int buf_len = strlen(buf); int tok_len = strlen(tok); int ret_val; char *ptr = buf; /* * some specials: */ if (buf_len == tok_len && ! strcmp (buf, tok)) { return 0; } else if (buf_len == 0) { return tok_len; } else if (tok_len == 0) { return buf_len; } /* maximum return value (no similarities): */ ret_val = max2(buf_len, tok_len); while (*ptr) { char *p, old_char; int rc; /* * skip over white: */ while (*ptr && isspace(*ptr)) { ptr++; } // if (! *ptr) { // return ret_val; // } /* * scan end of token: */ p = ptr; while (*p && ! isspace(*p)) { p++; } old_char = *p; *p = 0; rc = levenshtein_distance (ptr, tok); if (! rc) { /* exact match: */ return 0; } if (rc != -1 && rc < ret_val) { ret_val = rc; } *p = old_char; ptr = p; } return ret_val; } #ifdef STANDALONE int main (int argc, char *argv[]) { int rc; char *tok; char line [1024]; if (argc != 2) { return 0; } tok = argv [1]; while (fgets (line, sizeof (line), stdin)) { if (line [strlen (line) - 1] == '\n') { line [strlen (line) - 1] = 0; } rc = lev_tok (line, tok); printf ("see: tok=.%s. line=.%s. -> %d\n", line, tok, rc); } return rc; } #endif /* * the remaining function is taken from: * http://www.merriampark.com/ldc.htm * Lorenzo Seidenari (sixmoney@virgilio.it) */ #define min2(a,b) ((a) < (b) ? (a) : (b)) #define minimum(a,b,c) min2(min2((a),(b)),(c)) /****************************************/ /*Implementation of Levenshtein distance*/ /****************************************/ static int levenshtein_distance(char *s,char*t) /*Compute levenshtein distance between s and t*/ { //Step 1 int k,i,j,n,m,cost,*d,distance; n=strlen(s); m=strlen(t); if(n!=0&&m!=0) { d=(int *)malloc((sizeof(int))*(m+1)*(n+1)); m++; n++; //Step 2 for(k=0;k<n;k++) d[k]=k; for(k=0;k<m;k++) d[k*n]=k; //Step 3 and 4 for(i=1;i<n;i++) for(j=1;j<m;j++) { //Step 5 if(s[i-1]==t[j-1]) cost=0; else cost=1; //Step 6 d[j*n+i]=minimum(d[(j-1)*n+i]+1,d[j*n+i-1]+1,d[(j-1)*n+i-1]+cost); } distance=d[n*m-1]; free(d); return distance; } else return -1; //a negative return value means that one or both strings are empty. } /* Local Variables: */ /* mode:c */ /* compile-command:"g++ -O -shared -I/usr/local/mysql/include/mysql -o udf_levtok.so udf_levtok.cc" */ /* End: */ /* end of udf_levtok.cc */