Google translation script

Date:2008-06-04

In my projects, I often have tons of one-liners to translate. The best translation service I have found so far is, well, Google Translate. But switching to the browser every few seconds becomes annoying rather fast. There does not seem to be any script automating this, so I made a quick one; maybe it’ll help some people :p

Example

./tr.py de en hallo welt
Hello World

Code

#!/usr/bin/python
import sys, re
from htmlentitydefs import name2codepoint
from urllib import urlencode, FancyURLopener
from BeautifulSoup import BeautifulSoup

"""
Copyright (c) 2008 kang@insecure.ws
This program interacts with a web translation service to translate text between many languages.
It is distributed under the terms of the GPL version 3.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""

class URLOpener(FancyURLopener):
    """URL opener that presents a browser-like user-agent.

    The user-agent below may be changed freely, but one is required:
    without it Google refuses to serve the request.
    """
    version = (
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14) "
        "Gecko/20080404 Iceweasel/2.0.0.14"
    )

# Address of the translation form the request is sent to.
url = 'http://translate.google.de/translate_t'

# Default form fields sent in the POST body: input encoding and the text.
post = dict(ie='UTF8', text='hallo welt')

# Default query-string parameters: source language and target language.
get = dict(sl='de', tl='en')

def translate(text, sl="de", tl="en"):
    """Translate ``text`` from language ``sl`` to ``tl`` and print the result.

    The language pair is sent in the query string and the text in the POST
    body. The translation is read from the ``result_box`` div of the
    returned page, HTML entities are decoded, and the result is printed to
    stdout.

    Parameters:
        text: the text to translate.
        sl: source language code (default "de").
        tl: target language code (default "en").

    Returns None. If the page holds no usable ``result_box`` content, a
    message is written to stderr and nothing is printed to stdout.
    """
    # Build per-call copies so the module-level defaults are never altered.
    query = dict(get, sl=sl, tl=tl)
    form = dict(post, text=text)

    page = URLOpener().open(url + "?" + urlencode(query), urlencode(form))
    try:
        soup = BeautifulSoup(page.read())
    finally:
        # Release the connection even if reading or parsing fails.
        page.close()

    result = soup.find('div', id='result_box')
    # find() yields None when the div is absent; .string can also be None.
    # NOTE(review): presumably .string is None when the div has several
    # children -- confirm against the BeautifulSoup documentation.
    if result is None or result.string is None:
        sys.stderr.write("No translation found in the response\n")
        return
    print(htmldecode(result.string))

def list2str(mylist):
    """Join the strings in ``mylist`` into one space-separated string.

    The result has no square brackets and no trailing separator; an empty
    list gives an empty string.

    Parameters:
        mylist: an iterable of strings (e.g. a slice of ``sys.argv``).

    Returns the joined string.
    """
    return " ".join(mylist)

# Matches "&#123;", "&#x7B;" and "&name;" references; the trailing
# semicolon is optional. Compiled once instead of on every call.
_CHARREF_RE = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?')


def htmldecode(text):
    """Decode HTML entities in the given text.

    Decimal references (``&#233;``), hexadecimal references (``&#xE9;``)
    and named entities found in ``name2codepoint`` are replaced by the
    character they stand for. Unknown names, and numeric references whose
    code point cannot be converted to a character, are left untouched.

    Parameters:
        text: the string to decode.

    Returns the text with the recognised references replaced.
    """
    try:
        text_type, to_unichr = unicode, unichr
    except NameError:
        # No separate unicode type on this interpreter: str/chr cover it.
        text_type, to_unichr = str, chr

    # isinstance rather than an exact type check: subclasses of the unicode
    # type must take the unicode path too, otherwise byte characters get
    # mixed into unicode text.
    if isinstance(text, text_type):
        uchr = to_unichr
    else:
        def uchr(value):
            # Byte string input: stay with byte characters where possible.
            return to_unichr(value) if value > 255 else chr(value)

    def entitydecode(match):
        entity = match.group(1)
        try:
            if entity.startswith('#x'):
                return uchr(int(entity[2:], 16))
            if entity.startswith('#'):
                return uchr(int(entity[1:]))
            if entity in name2codepoint:
                return uchr(name2codepoint[entity])
        except (ValueError, OverflowError):
            # Code point outside the range the interpreter can represent.
            pass
        # Not decodable: keep the original text of the reference.
        return match.group(0)

    return _CHARREF_RE.sub(entitydecode, text)

def main(argv=None):
    """Command-line entry point.

    With ``<sl> <tl> <text...>`` the text given on the command line is
    translated once. With only ``<sl> <tl>`` lines are read interactively
    from stdin and translated one by one until EOF. Any other argument
    count prints the usage text.

    Parameters:
        argv: argument list including the program name; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    # In-line translation
    if len(argv) >= 4:
        translate(list2str(argv[3:]), argv[1], argv[2])
        return

    if len(argv) != 3:
        print("USAGE: " + argv[0] + " <sl> <tl> [text]")
        print("\tsl: source language")
        print("\ttl: translate to language")
        print("")
        print("If no text is provided, will use stdin (abort with ^D or EOF)")
        print("Example:")
        print(argv[0] + " de en hallo welt")
        return

    # Stdin translation
    sl, tl = argv[1], argv[2]
    while True:
        try:
            text = raw_input("Text: ")
        except EOFError:
            # Finish the prompt line before leaving.
            print("")
            return
        translate(text, sl, tl)


# Run only when executed as a script, so importing the module is harmless.
if __name__ == "__main__":
    main()