Files
edx-platform/i18n/segment.py
Ned Batchelder 1e8a0cb7e0 Tighten up the extraction I/O
We were writing the main file twice.  No need.
2014-01-24 16:18:43 -05:00

143 lines
4.5 KiB
Python
Executable File

#!/usr/bin/env python
"""
Segment a .po file to produce smaller files based on the locations of the
messages.
"""
import copy
import fnmatch
import logging
import sys
import polib
from i18n.config import CONFIGURATION
# Module-level logger, named after this module; used for the progress and
# error messages emitted while segmenting.
LOG = logging.getLogger(__name__)
def segment_pofiles(locale):
    """Segment every configured .po file for `locale`.

    Each entry of `CONFIGURATION.segment` maps a .po file name (resolved
    against the messages directory for `locale`) to its segment
    specification; every such file is passed to `segment_pofile`.

    Returns the set of all segment file names that were written.

    """
    # One set of written files per configured .po file, produced lazily and in
    # configuration order.
    written_per_file = (
        segment_pofile(CONFIGURATION.get_messages_dir(locale) / po_name, segment_spec)
        for po_name, segment_spec in CONFIGURATION.segment.items()
    )
    # Merge the per-file results into a single set.
    return set().union(*written_per_file)
def segment_pofile(filename, segments):
    """Segment a .po file using patterns in `segments`.

    The .po file at `filename` is read, and the occurrence locations of its
    messages are examined.  `segments` is a dictionary: the keys are segment
    .po filenames, the values are lists of patterns::

        {
            'django-studio.po': [
                'cms/*',
                'some-other-studio-place/*',
            ],
            'django-weird.po': [
                '*/weird_*.*',
            ],
        }

    If all a message's occurrences match the patterns for a segment, then that
    message is written to the new segmented .po file.

    Any message that matches no segments, or more than one, is written back to
    the original file.  A message that records no occurrences at all is treated
    as matching no segment, so it also stays in the original file.

    Note that the original file is overwritten with only the messages that
    were not segmented out.

    Arguments:
        filename (path.path): a path object referring to the original .po file.
        segments (dict): specification of the segments to create.

    Returns:
        a set of path objects, all the segment files written.

    """
    source_po = polib.pofile(filename)
    LOG.info("Reading %s entries from %s", len(source_po), filename)

    # A new pofile just like the source, but with no messages.  We'll put
    # anything not segmented into this file.
    remaining_po = copy.deepcopy(source_po)
    remaining_po[:] = []

    # Turn the segments dictionary into two structures: segment_patterns is a
    # list of (pattern, segmentfile) pairs.  segment_po_files is a dict mapping
    # segment file names to pofile objects of their contents.  The original
    # file is keyed by `filename` itself.
    segment_po_files = {filename: remaining_po}
    segment_patterns = []
    for segmentfile, patterns in segments.items():
        segment_po_files[segmentfile] = copy.deepcopy(remaining_po)
        segment_patterns.extend((pat, segmentfile) for pat in patterns)

    # Examine each message in the source file.  If all of its occurrences match
    # a pattern for the same segment, it goes in that segment.  Otherwise, it
    # goes in remaining.
    for msg in source_po:
        msg_segments = set()
        for occ_file, _ in msg.occurrences:
            for pat, segment_file in segment_patterns:
                if fnmatch.fnmatch(occ_file, pat):
                    # First matching pattern wins for this occurrence.
                    msg_segments.add(segment_file)
                    break
            else:
                # No pattern matched this occurrence: it counts toward the
                # original file.
                msg_segments.add(filename)

        if len(msg_segments) == 1:
            # This message belongs in this segment.
            (segment_file,) = msg_segments
            segment_po_files[segment_file].append(msg)
        else:
            # Either the message is in more than one segment, or it has no
            # occurrences at all (empty set); both cases go back in the main
            # file rather than aborting the whole run.
            remaining_po.append(msg)

    # Write out the results.
    files_written = set()
    for segment_file, pofile in segment_po_files.items():
        if segment_file == filename:
            # The main file is written back where it was read from.  Joining
            # it onto its own directory would presumably duplicate the
            # directory part when `filename` is a relative path.
            out_file = filename
        else:
            out_file = filename.dirname() / segment_file

        if not pofile:
            LOG.error("No messages to write to %s, did you run segment twice?", out_file)
        else:
            LOG.info("Writing %s entries to %s", len(pofile), out_file)
            pofile.save(out_file)
            files_written.add(out_file)

    return files_written
def main(argv):
    """
    $ segment.py LOCALE [...]

    Segment the .po files of each LOCALE named on the command line, following
    the segmenting rules in config.yaml.

    Segmenting is *not* idempotent: the input file itself is modified, so take
    care not to run this twice on the same file.

    `argv` is the full argument list, program name first.  Exits with an error
    message when no locale is given.

    """
    # This entry point is only a tool for segmenting translation files when a
    # new segment is added.  In the regular workflow, the extract phase calls
    # the functions above directly.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # Everything after the program name is a locale to process.
    locales = argv[1:]
    if not locales:
        sys.exit("Need a locale to segment")

    for locale in locales:
        segment_pofiles(locale)
# Script entry point: when executed directly (not imported), hand the full
# argument vector, program name included, to main().
if __name__ == "__main__":
    main(sys.argv)