% Workshop paper comparing Balanced Winnow and SVM classifiers for
% large-scale patent categorization (Dutch-Belgian IR Workshop, 2010).
% The proceedings title is stored in booktitle only; series is reserved
% for a named book series and is intentionally not set.
@inproceedings{fc61a780c4354a9cbaa44130498a4c15,
  author    = {Beuls, Katrien and Pflugfelder, Bernhard and Hanbury, Allan},
  title     = {Comparative Analysis of {Balanced Winnow} and {SVM} in Large Scale Patent Categorization},
  booktitle = {Proceedings of the 10th {Dutch-Belgian} Information Retrieval Workshop},
  pages     = {8--15},
  year      = {2010},
  month     = jan,
  day       = {26},
  language  = {English},
  keywords  = {Patent Classification, Intellectual Property, IPC taxonomy},
  abstract  = {This study investigates the effect of training different categorization algorithms on a corpus that is significantly larger than those reported in experiments in the literature. By means of machine learning techniques, a collection of 1.2 million patent applications is used to build a classifier that is able to classify documents with varyingly large feature spaces into the International Classification System (IPC) at Subclass level. The two algorithms that are compared are Balanced Winnow and Support Vector Machines (SVMs). Contrary to SVM, Balanced Winnow is frequently applied in today's patent categorization systems. Results show that SVM outperforms Winnow considerably on all four document representations that were tested. While Winnow results on the smallest sub-corpus do not necessarily hold for the full corpus, SVM results are more robust: they show smaller fluctuations in accuracy when smaller or larger feature spaces are used. The parameter tuning that was carried out for both algorithms confirms this result. Although it is necessary to tune SVM experiments to optimize either recall or precision --- whereas this can be combined when Winnow is used --- effective parameter settings obtained on a small corpus can be used for training a larger corpus.},
}