% Espinosa-Leal, Akusok, Lendasse, Bjork (2021): conference paper in "Proceedings of ELM2019".
% The citation key is kept unchanged so existing citations continue to resolve.
@inproceedings{3331687254ce4d7c8e6bbeee61218a71,
  author    = {Espinosa-Leal, Leonardo and Akusok, Anton and Lendasse, Amaury and Bj{\"o}rk, Kaj-Mikael},
  title     = {Website Classification from Webpage Renders},
  booktitle = {Proceedings of {ELM2019}},
  editor    = {Cao, Jiuwen and Vong, Chi Man and Miche, Yoan and Lendasse, Amaury},
  series    = {Proceedings in Adaptation, Learning and Optimization},
  publisher = {Springer},
  address   = {Cham},
  year      = {2021},
  pages     = {41--50},
  isbn      = {978-3-030-58988-2},
  doi       = {10.1007/978-3-030-58989-9_5},
  language  = {English},
  keywords  = {113 Computer and information sciences},
  abstract  = {In this paper, we present a fast and accurate method for the classification of web content. Our algorithm uses the visual information of the main homepage saved in an image format by means of a full body snapshot. Sliding windows of different sizes and overlaps are used to obtain a large subset of images for each render. For each sub-image, a feature vector is extracted by means of a pre-trained deep learning model. A Extreme Learning Machine (ELM) model is trained for different values of hidden neurons using the large collection of features from a curated dataset of 5979 webpages with different classes: adult, alcohol, dating, gambling, shopping, tobacco and weapons. Our results show that the ELM classifier can be trained without the manual specific object tagging of the sub-images by giving excellent results in comparison to more complex deep learning models. A random forest classifier was trained for the specific class of weapons providing an accuracy of 95\% with a F1 score of 0.8.},
}