1<?php
2
3// SPDX-FileCopyrightText: 2004-2023 Ryan Parman, Sam Sneddon, Ryan McCue
4// SPDX-License-Identifier: BSD-3-Clause
5
6declare(strict_types=1);
7
8namespace SimplePie\Content\Type;
9
10use InvalidArgumentException;
11use SimplePie\File;
12use SimplePie\HTTP\Response;
13
14/**
15 * Content-type sniffing
16 *
17 * Based on the rules in http://tools.ietf.org/html/draft-abarth-mime-sniff-06
18 *
19 * This is used since we can't always trust Content-Type headers, and is based
20 * upon the HTML5 parsing rules.
21 *
22 *
23 * This class can be overloaded with {@see \SimplePie\SimplePie::set_content_type_sniffer_class()}
24 */
class Sniffer
{
    /**
     * The response whose content type is being determined
     *
     * @var File|Response
     */
    public $file;

    /**
     * Set up the sniffer for the given response
     *
     * @param File|Response $file Input file
     *
     * @throws InvalidArgumentException When $file is not a `Response` implementation
     */
    public function __construct(/* File */ $file)
    {
        $is_response = is_object($file) && $file instanceof Response;

        if (!$is_response) {
            // For BC the message names `File`, although every `Response` implementation is accepted
            $message = sprintf(
                '%s(): Argument #1 ($file) must be of type %s',
                __METHOD__,
                File::class
            );

            throw new InvalidArgumentException($message, 1);
        }

        $this->file = $file;
    }

    /**
     * Work out the Content-Type of the file
     *
     * The declared Content-Type header is used as a starting point; depending
     * on its value the body is sniffed to confirm or replace it.
     *
     * @return string Actual Content-Type
     */
    public function get_type()
    {
        $declared = null;
        if ($this->file->has_header('content-type')) {
            $declared = $this->file->get_header_line('content-type');
        }

        $encoding = null;
        if ($this->file->has_header('content-encoding')) {
            $encoding = $this->file->get_header_line('content-encoding');
        }

        // Without a declared type the body is all there is to go on.
        if ($declared === null) {
            return $this->unknown();
        }

        // These exact header values are not trusted when no content encoding
        // is declared, so the body decides between text and binary.
        $untrusted_plain_text = [
            'text/plain',
            'text/plain; charset=ISO-8859-1',
            'text/plain; charset=iso-8859-1',
            'text/plain; charset=UTF-8',
        ];
        if ($encoding === null && in_array($declared, $untrusted_plain_text, true)) {
            return $this->text_or_binary();
        }

        // Drop any parameters (everything from the first ';') and normalise.
        $semicolon = strpos($declared, ';');
        $official = $semicolon === false ? $declared : substr($declared, 0, $semicolon);
        $official = trim(strtolower($official));

        if (in_array($official, ['unknown/unknown', 'application/unknown'], true)) {
            return $this->unknown();
        }

        // XML types are returned as declared, without looking at the body.
        if (substr($official, -4) === '+xml'
            || in_array($official, ['text/xml', 'application/xml'], true)) {
            return $official;
        }

        if (substr($official, 0, 6) === 'image/') {
            // Prefer the sniffed image type, fall back to the declared one.
            $sniffed = $this->image();

            return $sniffed ? $sniffed : $official;
        }

        if ($official === 'text/html') {
            return $this->feed_or_html();
        }

        return $official;
    }

    /**
     * Decide whether the body is plain text or binary data
     *
     * @return string Actual Content-Type
     */
    public function text_or_binary()
    {
        $body = $this->file->get_body_content();

        // A body starting with one of these byte order marks is reported as
        // text without inspecting the remaining bytes.
        $byte_order_marks = [
            "\xFE\xFF",
            "\xFF\xFE",
            "\x00\x00\xFE\xFF",
            "\xEF\xBB\xBF",
        ];
        foreach ($byte_order_marks as $mark) {
            if (substr($body, 0, strlen($mark)) === $mark) {
                return 'text/plain';
            }
        }

        // Any byte from the listed control ranges marks the body as binary.
        $has_binary_byte = preg_match('/[\x00-\x08\x0E-\x1A\x1C-\x1F]/', $body);

        return $has_binary_byte ? 'application/octet-stream' : 'text/plain';
    }

    /**
     * Sniff a body whose type is unknown
     *
     * Looks for HTML markers, then well-known file signatures, and finally
     * falls back to the text-or-binary check.
     *
     * @return string Actual Content-Type
     */
    public function unknown()
    {
        $body = $this->file->get_body_content();

        // HTML markers are matched case-insensitively after leading whitespace.
        $start = strspn($body, "\x09\x0A\x0B\x0C\x0D\x20");
        foreach (['<!doctype html', '<html', '<script'] as $marker) {
            if (strtolower(substr($body, $start, strlen($marker))) === $marker) {
                return 'text/html';
            }
        }

        // File signatures must sit at the very first byte of the body.
        $signatures = [
            ['%PDF-', 'application/pdf'],
            ['%!PS-Adobe-', 'application/postscript'],
            ['GIF87a', 'image/gif'],
            ['GIF89a', 'image/gif'],
            ["\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", 'image/png'],
            ["\xFF\xD8\xFF", 'image/jpeg'],
            ["\x42\x4D", 'image/bmp'],
            ["\x00\x00\x01\x00", 'image/vnd.microsoft.icon'],
        ];
        foreach ($signatures as $signature) {
            $magic = $signature[0];
            if (substr($body, 0, strlen($magic)) === $magic) {
                return $signature[1];
            }
        }

        return $this->text_or_binary();
    }

    /**
     * Sniff the image type from the leading bytes of the body
     *
     * @return string|false Actual Content-Type, or false when no known image signature matches
     */
    public function image()
    {
        $body = $this->file->get_body_content();

        $signatures = [
            ['GIF87a', 'image/gif'],
            ['GIF89a', 'image/gif'],
            ["\x89\x50\x4E\x47\x0D\x0A\x1A\x0A", 'image/png'],
            ["\xFF\xD8\xFF", 'image/jpeg'],
            ["\x42\x4D", 'image/bmp'],
            ["\x00\x00\x01\x00", 'image/vnd.microsoft.icon'],
        ];
        foreach ($signatures as $signature) {
            $magic = $signature[0];
            if (substr($body, 0, strlen($magic)) === $magic) {
                return $signature[1];
            }
        }

        return false;
    }

    /**
     * Decide whether a body served as HTML is really a feed
     *
     * Skips whitespace, comments, `<!...>` declarations and `<?...?>`
     * processing instructions, then inspects the first real tag.
     *
     * @return string Actual Content-Type
     */
    public function feed_or_html()
    {
        $body = $this->file->get_body_content();

        $length = strlen($body);
        $whitespace = "\x09\x0A\x0D\x20";

        // The initial skip also swallows the bytes of a UTF-8 byte order mark.
        $cursor = strspn($body, $whitespace . "\xEF\xBB\xBF");

        while ($cursor < $length) {
            $char = $body[$cursor];

            if ($char !== '<') {
                // Anything other than whitespace before a tag means plain HTML.
                if (!in_array($char, ["\x09", "\x0A", "\x0D", "\x20"], true)) {
                    return 'text/html';
                }

                $cursor += strspn($body, $whitespace, $cursor);
                continue;
            }

            // Step past the '<' and classify what follows it.
            $cursor++;

            if (substr($body, $cursor, 3) === '!--') {
                $terminator = '-->';
                $cursor += 3;
            } elseif (substr($body, $cursor, 1) === '!') {
                $terminator = '>';
            } elseif (substr($body, $cursor, 1) === '?') {
                $terminator = '?>';
            } elseif (substr($body, $cursor, 3) === 'rss'
                || substr($body, $cursor, 7) === 'rdf:RDF') {
                return 'application/rss+xml';
            } elseif (substr($body, $cursor, 4) === 'feed') {
                return 'application/atom+xml';
            } else {
                return 'text/html';
            }

            // Jump behind the construct's terminator; an unterminated
            // construct means the body is treated as HTML.
            $end = $cursor < $length ? strpos($body, $terminator, $cursor) : false;
            if ($end === false) {
                return 'text/html';
            }

            $cursor = $end + strlen($terminator);
        }

        return 'text/html';
    }
}
242
// Make the class also available under the underscore-separated global name.
class_alias('SimplePie\Content\Type\Sniffer', 'SimplePie_Content_Type_Sniffer');
244