Writing Python Scripts in Unicode
# -*- coding: utf-8 -*-
String Literals
# Plain string literals: the two quote characters are interchangeable.
"Either single or double quotes can be used to quote strings."
'Either single or double quotes can be used to quote strings.'

# Raw string literal: the r prefix keeps backslashes as literal characters,
# which is convenient for Windows paths.
r"C:\Python26\README.txt"

# Multiline literals: triple quotes of either kind may span several lines,
# and the embedded line breaks become part of the string.  The second
# assignment rebinds txt, so only the single-quoted text remains afterwards.
txt = """
This is a multiline string literal
enclosed in triple double quotes.
"""
txt = '''
And this is a multiline string literal
enclosed in triple single quotes.
'''

# %-style formatting: each %d placeholder is filled, in order, from the
# tuple on the right-hand side of the % operator.
width, height = 80, 40
msg = "the size is %d x %d" % (width, height)
Basic String Operations
# The sample value is bound to `path` rather than `str`: rebinding `str`
# would shadow the builtin type and break any later call such as str(42).
path = r"C:\Python26\README.txt"
# remove trailing whitespace (rstrip only strips the right-hand side)
path = path.rstrip()
# string length
len(path)  #>>> 22
# fetch the last 4 characters
path[-4:]  #>>> '.txt'
# end test
path.endswith(".txt")  #>>> True
# split the string at the last occurrence of the separator; the result is
# always a 3-tuple (head, separator, tail)
path.rpartition("\\")  #>>> ('C:\\Python26', '\\', 'README.txt')
Parsing a Text File
import sys

# Load every line of the file up front; the with-block closes the handle
# even if reading fails.
with open(r"C:\Python26\README.txt", "rt") as readme:
    txtfile = readme.readlines()

# Terms to look for.  Lines are lower-cased before comparison, so the
# search is case-insensitive; a term only counts when it appears as a
# whole whitespace-delimited word.
searchterm = ['python', 'documentation']

for ln, line in enumerate(txtfile):
    lline = line.lower()
    word_list = lline.split()
    for term in searchterm:
        if term in word_list:
            # NOTE: index() returns the first *substring* occurrence, which
            # can precede the whole-word hit (e.g. "pythonic python").
            col = lline.index(term)
            # Report 1-based line:column.  `line` still carries its own
            # newline, so it is written out without appending another one.
            sys.stdout.write("Found '%s' in %d:%d %s" % (term, ln + 1, col + 1, line))
Regular Expression
import re

# Read the whole file as one string; the with-block guarantees the handle
# is closed afterwards.
with open(r"C:\Python26\README.txt", "rt") as readme:
    txt = readme.read()

# E-mail addresses.  Inside each character class the '-' is placed last so
# that it is a literal hyphen: written mid-class (as in "_-.") it is read
# as a range operator, and a reversed range makes re raise an error.  The
# dot in front of the top-level domain is escaped so it only matches '.'.
emails = re.findall(r'[a-zA-Z0-9+_.-]+@[0-9a-zA-Z][.0-9a-zA-Z-]*\.[a-zA-Z]+', txt)

# http/https URLs: a run of letters, digits, common URL punctuation and
# %XX escapes following the scheme.
urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', txt)

# A single pre-formatted argument keeps print() valid, with identical
# output, on both Python 2 and Python 3.
print("emails= %s \n" % (emails,))
print("urls= %s \n" % (urls,))
Extract the Title of an HTML Page
import re
import urllib

# Download the raw HTML.  urllib.urlopen is the Python 2 API; the response
# is closed explicitly so the connection is released even if read() fails.
response = urllib.urlopen('http://www.google.com')
try:
    page = response.read()
finally:
    response.close()

# Capture the text between the <title> tags.  The pattern must name the
# tags: a bare '(.*)' would match every line of the page instead.  The
# match is non-greedy, case-insensitive, and allowed to span line breaks.
title = re.findall(r'<title>(.*?)</title>', page, re.IGNORECASE | re.DOTALL)
print(title)
glob Module
import glob

# Terms to count; lines are lower-cased first, so matching is
# case-insensitive.  count() also finds the term inside longer words.
searchterm = ['python', 'documentation']

# Every .txt file directly inside the Python 2.6 installation directory.
files = glob.glob(r"C:\Python26\*.txt")

for filename in files:
    # The with-block closes each file before the next one is opened.
    with open(filename, "rt") as fh:
        # Number lines from 1 so the report uses human-friendly line numbers.
        for ln, line in enumerate(fh, 1):
            lline = line.lower()
            for term in searchterm:
                num_matches = lline.count(term)
                if num_matches:
                    print("found '%s' %d times in %s on line %d." % (term, num_matches, filename, ln))
Read/Write a Unicode Text File
import codecs
# Open test.txt for reading through the codecs layer, so that data read
# from f is decoded from UTF-8.  The handle is left open here; code that
# continues this snippet is responsible for calling f.close() when done.
f = codecs.open("test.txt", "r", "utf-8")
Reference