RegEx: Expresiones regulares

4/28/2014 - 5:03 PM

RegEx: Expresiones regulares

package Regex;

import java.text.Normalizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for validating strings against regular expressions and for
 * stripping accents, digits and other unwanted characters from text.
 *
 * <p>Unless stated otherwise, the cleaning methods return {@code null} when
 * given {@code null}, mirroring {@link #removeAccents(String)}.
 *
 * @author Omar
 */
public class RegexUtils implements RexexConstantes{

    /** Two or more consecutive space characters. */
    private static final Pattern MULTIPLE_SPACES = Pattern.compile(" {2,}");

    /** Any Unicode mark (general category M), such as the combining accents produced by NFD. */
    private static final Pattern UNICODE_MARKS = Pattern.compile("\\p{IsM}");

    /** Any character outside the ASCII range. */
    private static final Pattern NON_ASCII = Pattern.compile("[^\\p{ASCII}]");

    /** One or more characters from the Combining Diacritical Marks block. */
    private static final Pattern DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

    /** Any character that is neither an ASCII letter, an ASCII digit nor whitespace. */
    private static final Pattern NOT_ALPHANUMERIC_OR_WHITESPACE = Pattern.compile("[^a-zA-Z0-9\\s]");

    /** Any character that is neither an ASCII letter nor an ASCII digit. */
    private static final Pattern NOT_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

    /** Any non-digit character. */
    private static final Pattern NON_DIGIT = Pattern.compile("\\D");

    /** Any digit character. */
    private static final Pattern DIGIT = Pattern.compile("\\d");

    /**
     * Small manual demo: prints the sample string with its digits removed,
     * then the same string with everything except digits removed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(removeNumbers("ads2115fd51df"));
        System.out.println(removeLetter("ads2115fd51df"));
    }

    /**
     * Checks whether the <em>entire</em> string matches a regular expression
     * (case-sensitive). This is a whole-string match, not a substring search.
     *
     * <pre>
     *   validString("palabra", "pal.*")             // true
     *   validString("Donde esta la palabra", "pal") // false: only part of the input matches
     * </pre>
     *
     * @param cadena   string to validate; {@code null} never matches
     * @param regExStr regular expression the whole string must match
     * @return {@code true} if the whole string matches the expression
     * @throws java.util.regex.PatternSyntaxException if {@code regExStr} is not a valid regex
     * @throws NullPointerException if {@code regExStr} is {@code null}
     */
    public static boolean validString(String cadena,String regExStr){
        return validString(cadena, regExStr, false);
    }

    /**
     * Checks whether the <em>entire</em> string matches a regular expression,
     * optionally ignoring case. This is a whole-string match, not a substring search.
     *
     * <pre>
     *   validString("PALABRA", "pal.*", true)  // true
     *   validString("PALABRA", "pal.*", false) // false
     * </pre>
     *
     * @param cadena          string to validate; {@code null} never matches
     * @param regExStr        regular expression the whole string must match
     * @param caseInsensitive {@code true} to compile the expression with
     *                        {@link Pattern#CASE_INSENSITIVE}
     * @return {@code true} if the whole string matches the expression
     * @throws java.util.regex.PatternSyntaxException if {@code regExStr} is not a valid regex
     * @throws NullPointerException if {@code regExStr} is {@code null}
     */
    public static boolean validString(String cadena,String regExStr,boolean caseInsensitive){
        int flags = caseInsensitive ? Pattern.CASE_INSENSITIVE : 0;
        // Compile first so that an invalid or null expression is still reported
        // even when there is nothing to match against.
        Pattern pattern = Pattern.compile(regExStr, flags);
        if (cadena == null) {
            return false;
        }
        return pattern.matcher(cadena).matches();
    }

    /**
     * Collapses every run of two or more consecutive spaces into a single space.
     * All other characters are left untouched.
     *
     * <pre>
     *   removeSpacesMultiples("Hola como      estas") // "Hola como estas"
     * </pre>
     *
     * @param cadena string that may contain repeated spaces
     * @return the string with each run of spaces reduced to one space,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeSpacesMultiples(String cadena) {
        if (cadena == null) {
            return null;
        }
        return MULTIPLE_SPACES.matcher(cadena).replaceAll(" ");
    }

    /**
     * Decomposes the string to Unicode NFD form and removes every Unicode
     * mark character, which drops accents and other combining marks while
     * keeping the base characters.
     *
     * @param cadena string to clean
     * @return the decomposed string without mark characters,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String deleteUnknowChars(String cadena) {
        if (cadena == null) {
            return null;
        }
        String decomposed = Normalizer.normalize(cadena, Normalizer.Form.NFD);
        return UNICODE_MARKS.matcher(decomposed).replaceAll("");
    }

    /**
     * Decomposes the string to Unicode NFD form and removes every non-ASCII
     * character. Accented letters keep their ASCII base letter; characters
     * with no ASCII decomposition are dropped entirely.
     *
     * @param cadena string to normalize
     * @return an ASCII-only version of the string,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String normalizeStringAscii(String cadena){
        if (cadena == null) {
            return null;
        }
        String decomposed = Normalizer.normalize(cadena, Normalizer.Form.NFD);
        return NON_ASCII.matcher(decomposed).replaceAll("");
    }

    /**
     * Removes accents by decomposing the string to Unicode NFD form and
     * deleting the characters of the Combining Diacritical Marks block.
     *
     * @param cadena string that may contain accented characters
     * @return the string without accents,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeAccents(String cadena){
        if (cadena == null) {
            return null;
        }
        String decomposed = Normalizer.normalize(cadena, Normalizer.Form.NFD);
        return DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
    }

    /**
     * Removes every character that is neither an ASCII letter, an ASCII digit
     * nor whitespace. Accented letters are removed, not converted.
     *
     * @param cadena string to clean
     * @return the string containing only ASCII letters, digits and whitespace,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeNoAphanumericsAnsSpaces(String cadena){
        return strip(NOT_ALPHANUMERIC_OR_WHITESPACE, cadena);
    }

    /**
     * Removes every character that is not an ASCII letter or ASCII digit,
     * including whitespace.
     *
     * @param cadena string to clean
     * @return the string containing only ASCII letters and digits,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeNonAphanumerics(String cadena){
        return strip(NOT_ALPHANUMERIC, cadena);
    }

    /**
     * Keeps only the digits of a string; every non-digit character (not just
     * letters) is removed.
     *
     * @param cadena string mixing digits and other characters
     * @return the digits of the string in their original order,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeLetter(String cadena){
        return strip(NON_DIGIT, cadena);
    }

    /**
     * Removes every digit from a string; all other characters are kept.
     *
     * @param cadena string that may contain digits
     * @return the string without digits,
     *         or {@code null} if {@code cadena} is {@code null}
     */
    public static String removeNumbers(String cadena){
        return strip(DIGIT, cadena);
    }

    /** Deletes every match of {@code pattern} from {@code cadena}; null input yields null. */
    private static String strip(Pattern pattern, String cadena) {
        return cadena == null ? null : pattern.matcher(cadena).replaceAll("");
    }

}

RexexConstantes.java

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package Regex;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regular-expression constants for common validations (user name, password,
 * email, image file name, IPv4 address, time of day and date).
 *
 * <p>Several of these expressions carry no {@code ^}/{@code $} anchors, so use
 * whole-string matching ({@code Matcher.matches()} or {@code String.matches()})
 * when validating a complete value.
 *
 * @author Omar
 */
public interface RexexConstantes {

    /*
    ^           # start of the line
    [a-z0-9_-]  # lowercase letters, digits, underscore or hyphen
    {3,15}      # between 3 and 15 characters
    $           # end of the line
    */
    public static final String REGEX_USER_NAME = "^[a-z0-9_-]{3,15}$";


    /*
    (              # start of group
    (?=.*\d)       #   must contain at least one digit
    (?=.*[a-z])    #   must contain at least one lowercase letter
    (?=.*[A-Z])    #   must contain at least one uppercase letter
    (?=.*[@#$%])   #   must contain at least one of the symbols @ # $ %
    .{6,20}        #   any characters, between 6 and 20 in total
    )              # end of group
    */
    public static final String REGEX_PASSWORD = "((?=.*\\d)(?=.*[a-z])(?=.*[A-Z])(?=.*[@#$%]).{6,20})";

    /*
    ^                       # start of the line
    [_A-Za-z0-9-]+          # local part: one or more letters, digits, "_" or "-"
    ( \.[_A-Za-z0-9-]+ ) *  # group 1 (optional, repeatable): a dot followed by more of the same
    @                       # the "@" symbol
    [A-Za-z0-9]+            # first domain label: one or more letters or digits
    ( \.[A-Za-z0-9]+ ) *    # group 2 (optional, repeatable): further domain labels
    ( \.[A-Za-z]{2,} )      # group 3: final label, a dot followed by at least two letters
    $                       # end of the line

    The dots are written as "\\." in the Java literal, which is the regex "\."
    (a literal dot).
    */
    public static final String REGEX_EMAIL = "^[_A-Za-z0-9-]+(\\.[_A-Za-z0-9-]+)*@[A-Za-z0-9]+(\\.[A-Za-z0-9]+)*(\\.[A-Za-z]{2,})$";

    /*
    (                        # start of group 1
     [^\s]+                  #   one or more non-whitespace characters
     (                       #   start of group 2
      \.                     #     a literal dot
      (?i)                   #     case-insensitive from here on
      (jpg|png|gif|bmp)      #     group 3: one of the accepted extensions
     )                       #   end of group 2
     $                       #   end of the string
    )                        # end of group 1
    */
    public static final String REGEX_IMAGE_FILE_NAME = "([^\\s]+(\\.(?i)(jpg|png|gif|bmp))$)";

    /*
    ^               # start of the line
    (               #   start of group 1 (one octet)
      [01]?\d\d?    #     one or two digits, or three digits starting with 0 or 1 (0-199)
       |            #     ...or
      2[0-4]\d      #     200-249
       |            #     ...or
      25[0-5]       #     250-255
    )               #   end of group 1
     \.             #   a literal dot
     ...            #   the octet group is repeated for groups 2, 3 and 4,
                    #   separated by dots (no dot after the last octet)
    $               # end of the line
    */
    public static final String REGEX_IP_ADDRESS = "^([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])$";


    /*
    (               # start of group 1 (hour)
     1[012]         #   10, 11 or 12
     |              #   ...or
     [1-9]          #   1 to 9
    )               # end of group 1
    :               # a colon
    [0-5][0-9]      # minutes, 00 to 59
    (\s)?           # an optional whitespace character
    (?i)            # case-insensitive from here on
    (am|pm)         # "am" or "pm"
    */
    public static final String REGEX_HOUR_12H = "(1[012]|[1-9]):[0-5][0-9](\\s)?(?i)(am|pm)";

    /*
    (               # start of group 1 (hour)
     [01]?[0-9]     #   0-9, 00-09 or 10-19
     |              #   ...or
     2[0-3]         #   20 to 23
    )               # end of group 1
    :               # a colon
    [0-5][0-9]      # minutes, 00 to 59
    */
    public static final String REGEX_HOUR_24H = "([01]?[0-9]|2[0-3]):[0-5][0-9]";

    /*
    (               # start of group 1 (day)
     0?[1-9]        #   01-09 or 1-9
     |              #   ...or
     [12][0-9]      #   10-29
     |              #   ...or
     3[01]          #   30 or 31
    )               # end of group 1
    /               # a slash
    (               # start of group 2 (month)
     0?[1-9]        #   01-09 or 1-9
     |              #   ...or
     1[012]         #   10, 11 or 12
    )               # end of group 2
    /               # a slash
    (               # start of group 3 (year)
     (19|20)\d\d    #   1900-1999 or 2000-2099
    )               # end of group 3

    Only the numeric ranges are checked; impossible dates such as 31/02 are
    still accepted.
    */
    public static final String REGEX_DAY_DDMMYYYY = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\d\\d)";



}

Help

Character Classes

If you browse through the Pattern class specification, you'll see tables summarizing the supported regular expression constructs. In the "Character Classes" section you'll find the following:


Construct	Description
[abc]           a, b, or c (simple class)
[^abc]          Any character except a, b, or c (negation)
[a-zA-Z]	a through z, or A through Z, inclusive (range)
[a-d[m-p]]	a through d, or m through p: [a-dm-p] (union)
[a-z&&[def]]	d, e, or f (intersection)
[a-z&&[^bc]]	a through z, except for b and c: [ad-z] (subtraction)
[a-z&&[^m-p]]	a through z, and not m through p: [a-lq-z] (subtraction)


Predefined Character Classes

The Pattern API contains a number of useful predefined character classes, which offer convenient shorthands for commonly 
used regular expressions:


Construct	Description
.               Any character (may or may not match line terminators)
\d              A digit: [0-9]
\D              A non-digit: [^0-9]
\s              A whitespace character: [ \t\n\x0B\f\r]
\S              A non-whitespace character: [^\s]
\w              A word character: [a-zA-Z_0-9]
\W              A non-word character: [^\w]



Quantifiers

Quantifiers allow you to specify the number of occurrences to match against. For convenience, the three sections of the Pattern API 
specification describing greedy, reluctant, and possessive quantifiers are presented below. At first glance it may appear that the 
quantifiers X?, X?? and X?+ do exactly the same thing, since they all promise to match "X, once or not at all". There are subtle 
implementation differences which will be explained near the end of this section.

Greedy	Reluctant	Possessive	Meaning
X?	X??	X?+	X, once or not at all
X*	X*?	X*+	X, zero or more times
X+	X+?	X++	X, one or more times
X{n}	X{n}?	X{n}+	X, exactly n times
X{n,}	X{n,}?	X{n,}+	X, at least n times
X{n,m}	X{n,m}?	X{n,m}+	X, at least n but not more than m times


Capturing Groups

In the previous section, we saw how quantifiers attach to one character, character class, or capturing group at a time. But until now, we have not discussed the notion of capturing groups in any detail.

Capturing groups are a way to treat multiple characters as a single unit. They are created by placing the characters to be grouped inside a set of parentheses. For example, the regular expression (dog) creates a single group containing the letters "d" "o" and "g". The portion of the input string that matches the capturing group will be saved in memory for later recall via backreferences (as discussed below in the section, Backreferences).

Numbering

As described in the Pattern API, capturing groups are numbered by counting their opening parentheses from left to right. In the expression ((A)(B(C))), for example, there are four such groups:

1. ((A)(B(C)))
2. (A)
3. (B(C))
4. (C)

Boundary Matchers

Until now, we've only been interested in whether or not a match is found at some location within a particular input string. We never cared about where in the string the match was taking place.

You can make your pattern matches more precise by specifying such information with boundary matchers. For example, maybe you're interested in finding a particular word, but only if it appears at the beginning or end of a line. Or maybe you want to know if the match is taking place on a word boundary, or at the end of the previous match.

The following table lists and explains all the boundary matchers.

Boundary Construct	Description
^	The beginning of a line
$	The end of a line
\b	A word boundary
\B	A non-word boundary
\A	The beginning of the input
\G	The end of the previous match
\Z	The end of the input but for the final terminator, if any
\z	The end of the input

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

RegEx: Expresiones regulares